1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86MachineFunctionInfo.h"
21 #include "X86ShuffleDecodeConstantPool.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
24 #include "llvm/ADT/SmallBitVector.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
29 #include "llvm/Analysis/EHPersonalities.h"
30 #include "llvm/CodeGen/IntrinsicLowering.h"
31 #include "llvm/CodeGen/MachineFrameInfo.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineJumpTableInfo.h"
35 #include "llvm/CodeGen/MachineModuleInfo.h"
36 #include "llvm/CodeGen/MachineRegisterInfo.h"
37 #include "llvm/CodeGen/WinEHFuncInfo.h"
38 #include "llvm/IR/CallSite.h"
39 #include "llvm/IR/CallingConv.h"
40 #include "llvm/IR/Constants.h"
41 #include "llvm/IR/DerivedTypes.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/GlobalAlias.h"
44 #include "llvm/IR/GlobalVariable.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/Intrinsics.h"
47 #include "llvm/MC/MCAsmInfo.h"
48 #include "llvm/MC/MCContext.h"
49 #include "llvm/MC/MCExpr.h"
50 #include "llvm/MC/MCSymbol.h"
51 #include "llvm/Support/CommandLine.h"
52 #include "llvm/Support/Debug.h"
53 #include "llvm/Support/ErrorHandling.h"
54 #include "llvm/Support/MathExtras.h"
55 #include "llvm/Target/TargetOptions.h"
56 #include "X86IntrinsicsInfo.h"
57 #include <bitset>
58 #include <numeric>
59 #include <cctype>
60 using namespace llvm;
61 
62 #define DEBUG_TYPE "x86-isel"
63 
64 STATISTIC(NumTailCalls, "Number of tail calls");
65 
66 static cl::opt<bool> ExperimentalVectorWideningLegalization(
67     "x86-experimental-vector-widening-legalization", cl::init(false),
68     cl::desc("Enable an experimental vector type legalization through widening "
69              "rather than promotion."),
70     cl::Hidden);
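// For illustration: under promotion an illegal type such as v2i16 is
// typically legalized by widening each element (v2i16 -> v2i32), while the
// experimental widening strategy selected above keeps the element size and
// grows the vector instead (v2i16 -> v8i16, with undef upper lanes).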
71 
72 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
73                                      const X86Subtarget &STI)
74     : TargetLowering(TM), Subtarget(STI) {
75   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
76   X86ScalarSSEf64 = Subtarget.hasSSE2();
77   X86ScalarSSEf32 = Subtarget.hasSSE1();
78   MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
79 
80   // Set up the TargetLowering object.
81 
82   // X86 is weird. It always uses i8 for shift amounts and setcc results.
83   setBooleanContents(ZeroOrOneBooleanContent);
84   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
85   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
86 
87   // For 64-bit, since we have so many registers, use the ILP scheduler.
88   // For 32-bit, use the register pressure specific scheduling.
89   // For Atom, always use ILP scheduling.
90   if (Subtarget.isAtom())
91     setSchedulingPreference(Sched::ILP);
92   else if (Subtarget.is64Bit())
93     setSchedulingPreference(Sched::ILP);
94   else
95     setSchedulingPreference(Sched::RegPressure);
96   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
97   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
98 
99   // Bypass expensive divides on Atom when compiling with O2.
100   if (TM.getOptLevel() >= CodeGenOpt::Default) {
101     if (Subtarget.hasSlowDivide32())
102       addBypassSlowDiv(32, 8);
103     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
104       addBypassSlowDiv(64, 16);
105   }
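  // Roughly, addBypassSlowDiv(32, 8) asks for a run-time bypass such as:
  //   if (((a | b) >> 8) == 0)          // both operands fit in 8 bits
  //     q = (uint8_t)a / (uint8_t)b;    //   cheap narrow DIV
  //   else
  //     q = a / b;                      //   full-width (slow) divide
  // so the common small-operand case avoids Atom's slow wide divider.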
106 
107   if (Subtarget.isTargetKnownWindowsMSVC()) {
108     // Setup Windows compiler runtime calls.
109     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
110     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
111     setLibcallName(RTLIB::SREM_I64, "_allrem");
112     setLibcallName(RTLIB::UREM_I64, "_aullrem");
113     setLibcallName(RTLIB::MUL_I64, "_allmul");
114     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
115     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
116     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
117     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
118     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
119   }
120 
121   if (Subtarget.isTargetDarwin()) {
122     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
123     setUseUnderscoreSetJmp(false);
124     setUseUnderscoreLongJmp(false);
125   } else if (Subtarget.isTargetWindowsGNU()) {
126     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
127     setUseUnderscoreSetJmp(true);
128     setUseUnderscoreLongJmp(false);
129   } else {
130     setUseUnderscoreSetJmp(true);
131     setUseUnderscoreLongJmp(true);
132   }
133 
134   // Set up the register classes.
135   addRegisterClass(MVT::i8, &X86::GR8RegClass);
136   addRegisterClass(MVT::i16, &X86::GR16RegClass);
137   addRegisterClass(MVT::i32, &X86::GR32RegClass);
138   if (Subtarget.is64Bit())
139     addRegisterClass(MVT::i64, &X86::GR64RegClass);
140 
141   for (MVT VT : MVT::integer_valuetypes())
142     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
143 
144   // We don't accept any truncstore of integer registers.
145   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
146   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
147   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
148   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
149   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
150   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
151 
152   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
153 
154   // SETOEQ and SETUNE require checking two conditions.
155   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
156   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
157   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
158   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
159   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
160   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
161 
162   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
163   // operation.
164   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
165   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
166   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
167 
168   if (Subtarget.is64Bit()) {
169     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
170       // f32/f64 are legal, f80 is custom.
171       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
172     else
173       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
174     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
175   } else if (!Subtarget.useSoftFloat()) {
176     // We have an algorithm for SSE2->double; for other targets this is
177     // turned into a 64-bit FILD followed by a conditional FADD.
178     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
179     // We have an algorithm for SSE2; for other targets this is turned
180     // into a 64-bit FILD or VCVTUSI2SS/SD (with AVX512).
181     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
182   }
183 
184   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
185   // this operation.
186   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
187   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
188 
189   if (!Subtarget.useSoftFloat()) {
190     // SSE has no i16 to fp conversion, only i32
191     if (X86ScalarSSEf32) {
192       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
193       // f32 and f64 cases are Legal, f80 case is not
194       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
195     } else {
196       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
197       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
198     }
199   } else {
200     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
201     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
202   }
203 
204   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
205   // this operation.
206   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
207   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
208 
209   if (!Subtarget.useSoftFloat()) {
210     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
211     // are Legal, f80 is custom lowered.
212     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
213     setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
214 
215     if (X86ScalarSSEf32) {
216       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
217       // f32 and f64 cases are Legal, f80 case is not
218       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
219     } else {
220       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
221       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
222     }
223   } else {
224     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
225     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
226     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
227   }
228 
229   // Handle FP_TO_UINT by promoting the destination to a larger signed
230   // conversion.
231   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
232   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
233   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
234 
235   if (Subtarget.is64Bit()) {
236     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
237       // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
238       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
239       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
240     } else {
241       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
242       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
243     }
244   } else if (!Subtarget.useSoftFloat()) {
245     // Since AVX is a superset of SSE3, only check for SSE here.
246     if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
247       // Expand FP_TO_UINT into a select.
248       // FIXME: We would like to use a Custom expander here eventually to do
249       // the optimal thing for SSE vs. the default expansion in the legalizer.
250       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
251     else
252       // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
253       // With SSE3 we can use fisttpll to convert to a signed i64; without
254       // SSE, we're stuck with a fistpll.
255       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
256 
257     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
258   }
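  // The FP_TO_UINT i32 Expand above is handled by the generic legalizer,
  // which roughly selects between two sequences: inputs below 2^31 go through
  // a plain signed FP_TO_SINT, while larger inputs have 2^31 subtracted first
  // and the sign bit ORed back into the integer result.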
259 
260   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
261   if (!X86ScalarSSEf64) {
262     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
263     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
264     if (Subtarget.is64Bit()) {
265       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
266       // Without SSE, i64->f64 goes through memory.
267       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
268     }
269   } else if (!Subtarget.is64Bit())
270     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
271 
272   // Scalar integer divide and remainder are lowered to use operations that
273   // produce two results, to match the available instructions. This exposes
274   // the two-result form to trivial CSE, which is able to combine x/y and x%y
275   // into a single instruction.
276   //
277   // Scalar integer multiply-high is also lowered to use two-result
278   // operations, to match the available instructions. However, plain multiply
279   // (low) operations are left as Legal, as there are single-result
280   // instructions for this in x86. Using the two-result multiply instructions
281   // when both high and low results are needed must be arranged by dagcombine.
282   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
283     setOperationAction(ISD::MULHS, VT, Expand);
284     setOperationAction(ISD::MULHU, VT, Expand);
285     setOperationAction(ISD::SDIV, VT, Expand);
286     setOperationAction(ISD::UDIV, VT, Expand);
287     setOperationAction(ISD::SREM, VT, Expand);
288     setOperationAction(ISD::UREM, VT, Expand);
289 
290     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependencies.
291     setOperationAction(ISD::ADDC, VT, Custom);
292     setOperationAction(ISD::ADDE, VT, Custom);
293     setOperationAction(ISD::SUBC, VT, Custom);
294     setOperationAction(ISD::SUBE, VT, Custom);
295   }
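  // For example, a function computing both "x / y" and "x % y" ends up with a
  // single ISD::SDIVREM node once CSE merges the two expanded operations, and
  // that node selects to one IDIV, which already produces the quotient and
  // remainder together (in EAX and EDX for 32-bit operands).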
296 
297   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
298   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
299   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
300                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
301     setOperationAction(ISD::BR_CC,     VT, Expand);
302     setOperationAction(ISD::SELECT_CC, VT, Expand);
303   }
304   if (Subtarget.is64Bit())
305     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
306   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
307   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
308   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
309   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
310 
311   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
312   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
313   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
314   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
315 
316   // Promote the i8 variants and force them up to i32, which has a shorter
317   // encoding.
318   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
319   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
320   if (!Subtarget.hasBMI()) {
321     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
322     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
323     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
324     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
325     if (Subtarget.is64Bit()) {
326       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
327       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
328     }
329   }
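  // Without BMI there is no TZCNT, only BSF, and BSF leaves its destination
  // undefined when the input is zero; the Custom lowering above therefore has
  // to guard the zero case (typically with a CMOV), while CTTZ_ZERO_UNDEF can
  // map directly onto BSF and remains Legal.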
330 
331   if (Subtarget.hasLZCNT()) {
332     // When promoting the i8 variants, force them to i32 for a shorter
333     // encoding.
334     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
335     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
336   } else {
337     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
338     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
339     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
340     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
341     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
342     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
343     if (Subtarget.is64Bit()) {
344       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
345       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
346     }
347   }
348 
349   // Special handling for half-precision floating point conversions.
350   // If we don't have F16C support, then lower half float conversions
351   // into library calls.
352   if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
353     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
354     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
355   }
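  // With the Expand action these conversions become runtime library calls
  // (commonly the __gnu_h2f_ieee / __gnu_f2h_ieee or __extendhfsf2 /
  // __truncsfhf2 helpers, depending on the runtime) instead of F16C
  // VCVTPH2PS / VCVTPS2PH instructions.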
356 
357   // There's never any support for operations beyond MVT::f32.
358   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
359   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
360   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
361   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
362 
363   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
364   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
365   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
366   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
367   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
368   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
369 
370   if (Subtarget.hasPOPCNT()) {
371     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
372   } else {
373     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
374     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
375     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
376     if (Subtarget.is64Bit())
377       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
378   }
379 
380   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
381 
382   if (!Subtarget.hasMOVBE())
383     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
384 
385   // These should be promoted to a larger select which is supported.
386   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
387   // X86 wants to expand cmov itself.
388   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
389     setOperationAction(ISD::SELECT, VT, Custom);
390     setOperationAction(ISD::SETCC, VT, Custom);
391   }
392   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
393     if (VT == MVT::i64 && !Subtarget.is64Bit())
394       continue;
395     setOperationAction(ISD::SELECT, VT, Custom);
396     setOperationAction(ISD::SETCC,  VT, Custom);
397     setOperationAction(ISD::SETCCE, VT, Custom);
398   }
399   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
400   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj
401   // exception handling; they are a light-weight setjmp/longjmp replacement
402   // used for continuations, user-level threading, and the like. No other
403   // SjLj exception interfaces are implemented, so please don't build your
404   // own exception handling on top of them.
405   // LLVM/Clang supports zero-cost DWARF exception handling.
406   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
407   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
408   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
409   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
410     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
411 
412   // Darwin ABI issue.
413   for (auto VT : { MVT::i32, MVT::i64 }) {
414     if (VT == MVT::i64 && !Subtarget.is64Bit())
415       continue;
416     setOperationAction(ISD::ConstantPool    , VT, Custom);
417     setOperationAction(ISD::JumpTable       , VT, Custom);
418     setOperationAction(ISD::GlobalAddress   , VT, Custom);
419     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
420     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
421     setOperationAction(ISD::BlockAddress    , VT, Custom);
422   }
423   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
424   for (auto VT : { MVT::i32, MVT::i64 }) {
425     if (VT == MVT::i64 && !Subtarget.is64Bit())
426       continue;
427     setOperationAction(ISD::SHL_PARTS, VT, Custom);
428     setOperationAction(ISD::SRA_PARTS, VT, Custom);
429     setOperationAction(ISD::SRL_PARTS, VT, Custom);
430   }
431 
432   if (Subtarget.hasSSE1())
433     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
434 
435   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
436 
437   // Expand certain atomics
438   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
439     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
440     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
441     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
442     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
443     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
444     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
445     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
446   }
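  // Marking these Custom lets the backend choose between the plain RMW LOCK
  // forms (e.g. LOCK ADD/SUB/AND/OR/XOR when the old value is unused), LOCK
  // XADD for add/sub when the result is needed, and a CMPXCHG loop otherwise.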
447 
448   if (Subtarget.hasCmpxchg16b()) {
449     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
450   }
451 
452   // FIXME - use subtarget debug flags
453   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
454       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
455       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
456     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
457   }
458 
459   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
460   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
461 
462   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
463   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
464 
465   setOperationAction(ISD::TRAP, MVT::Other, Legal);
466   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
467 
468   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
469   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
470   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
471   bool Is64Bit = Subtarget.is64Bit();
472   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
473   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
474 
475   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
476   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
477 
478   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
479 
480   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
481   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
482   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
483 
484   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
485     // f32 and f64 use SSE.
486     // Set up the FP register classes.
487     addRegisterClass(MVT::f32, &X86::FR32RegClass);
488     addRegisterClass(MVT::f64, &X86::FR64RegClass);
489 
490     for (auto VT : { MVT::f32, MVT::f64 }) {
491       // Use ANDPD to simulate FABS.
492       setOperationAction(ISD::FABS, VT, Custom);
493 
494       // Use XORP to simulate FNEG.
495       setOperationAction(ISD::FNEG, VT, Custom);
496 
497       // Use ANDPD and ORPD to simulate FCOPYSIGN.
498       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
499 
500       // We don't support sin/cos/fmod
501       setOperationAction(ISD::FSIN   , VT, Expand);
502       setOperationAction(ISD::FCOS   , VT, Expand);
503       setOperationAction(ISD::FSINCOS, VT, Expand);
504     }
505 
506     // Lower FGETSIGN to a MOVMSK plus an AND.
507     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
508     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
509 
510     // Expand FP immediates into loads from the stack, except for the special
511     // cases we handle.
512     addLegalFPImmediate(APFloat(+0.0)); // xorpd
513     addLegalFPImmediate(APFloat(+0.0f)); // xorps
514   } else if (UseX87 && X86ScalarSSEf32) {
515     // Use SSE for f32, x87 for f64.
516     // Set up the FP register classes.
517     addRegisterClass(MVT::f32, &X86::FR32RegClass);
518     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
519 
520     // Use ANDPS to simulate FABS.
521     setOperationAction(ISD::FABS , MVT::f32, Custom);
522 
523     // Use XORP to simulate FNEG.
524     setOperationAction(ISD::FNEG , MVT::f32, Custom);
525 
526     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
527 
528     // Use ANDPS and ORPS to simulate FCOPYSIGN.
529     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
530     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
531 
532     // We don't support sin/cos/fmod
533     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
534     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
535     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
536 
537     // Special cases we handle for FP constants.
538     addLegalFPImmediate(APFloat(+0.0f)); // xorps
539     addLegalFPImmediate(APFloat(+0.0)); // FLD0
540     addLegalFPImmediate(APFloat(+1.0)); // FLD1
541     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
542     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
543 
544     if (!TM.Options.UnsafeFPMath) {
545       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
546       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
547       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
548     }
549   } else if (UseX87) {
550     // f32 and f64 in x87.
551     // Set up the FP register classes.
552     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
553     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
554 
555     for (auto VT : { MVT::f32, MVT::f64 }) {
556       setOperationAction(ISD::UNDEF,     VT, Expand);
557       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
558 
559       if (!TM.Options.UnsafeFPMath) {
560         setOperationAction(ISD::FSIN   , VT, Expand);
561         setOperationAction(ISD::FCOS   , VT, Expand);
562         setOperationAction(ISD::FSINCOS, VT, Expand);
563       }
564     }
565     addLegalFPImmediate(APFloat(+0.0)); // FLD0
566     addLegalFPImmediate(APFloat(+1.0)); // FLD1
567     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
568     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
569     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
570     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
571     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
572     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
573   }
574 
575   // We don't support FMA.
576   setOperationAction(ISD::FMA, MVT::f64, Expand);
577   setOperationAction(ISD::FMA, MVT::f32, Expand);
578 
579   // Long double always uses X87, except f128, which uses FR128 (XMM) registers.
580   if (UseX87) {
581     if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
582       addRegisterClass(MVT::f128, &X86::FR128RegClass);
583       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
584       setOperationAction(ISD::FABS , MVT::f128, Custom);
585       setOperationAction(ISD::FNEG , MVT::f128, Custom);
586       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
587     }
588 
589     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
590     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
591     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
592     {
593       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
594       addLegalFPImmediate(TmpFlt);  // FLD0
595       TmpFlt.changeSign();
596       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
597 
598       bool ignored;
599       APFloat TmpFlt2(+1.0);
600       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
601                       &ignored);
602       addLegalFPImmediate(TmpFlt2);  // FLD1
603       TmpFlt2.changeSign();
604       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
605     }
606 
607     if (!TM.Options.UnsafeFPMath) {
608       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
609       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
610       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
611     }
612 
613     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
614     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
615     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
616     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
617     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
618     setOperationAction(ISD::FMA, MVT::f80, Expand);
619   }
620 
621   // Always use a library call for pow.
622   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
623   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
624   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
625 
626   setOperationAction(ISD::FLOG, MVT::f80, Expand);
627   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
628   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
629   setOperationAction(ISD::FEXP, MVT::f80, Expand);
630   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
631   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
632   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
633 
634   // Some FP actions are always expanded for vector types.
635   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
636                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
637     setOperationAction(ISD::FSIN,      VT, Expand);
638     setOperationAction(ISD::FSINCOS,   VT, Expand);
639     setOperationAction(ISD::FCOS,      VT, Expand);
640     setOperationAction(ISD::FREM,      VT, Expand);
641     setOperationAction(ISD::FPOWI,     VT, Expand);
642     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
643     setOperationAction(ISD::FPOW,      VT, Expand);
644     setOperationAction(ISD::FLOG,      VT, Expand);
645     setOperationAction(ISD::FLOG2,     VT, Expand);
646     setOperationAction(ISD::FLOG10,    VT, Expand);
647     setOperationAction(ISD::FEXP,      VT, Expand);
648     setOperationAction(ISD::FEXP2,     VT, Expand);
649   }
650 
651   // First set operation action for all vector types to either promote
652   // (for widening) or expand (for scalarization). Then we will selectively
653   // turn on ones that can be effectively codegen'd.
654   for (MVT VT : MVT::vector_valuetypes()) {
655     setOperationAction(ISD::SDIV, VT, Expand);
656     setOperationAction(ISD::UDIV, VT, Expand);
657     setOperationAction(ISD::SREM, VT, Expand);
658     setOperationAction(ISD::UREM, VT, Expand);
659     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
660     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
661     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
662     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
663     setOperationAction(ISD::FMA,  VT, Expand);
664     setOperationAction(ISD::FFLOOR, VT, Expand);
665     setOperationAction(ISD::FCEIL, VT, Expand);
666     setOperationAction(ISD::FTRUNC, VT, Expand);
667     setOperationAction(ISD::FRINT, VT, Expand);
668     setOperationAction(ISD::FNEARBYINT, VT, Expand);
669     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
670     setOperationAction(ISD::MULHS, VT, Expand);
671     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
672     setOperationAction(ISD::MULHU, VT, Expand);
673     setOperationAction(ISD::SDIVREM, VT, Expand);
674     setOperationAction(ISD::UDIVREM, VT, Expand);
675     setOperationAction(ISD::CTPOP, VT, Expand);
676     setOperationAction(ISD::CTTZ, VT, Expand);
677     setOperationAction(ISD::CTLZ, VT, Expand);
678     setOperationAction(ISD::ROTL, VT, Expand);
679     setOperationAction(ISD::ROTR, VT, Expand);
680     setOperationAction(ISD::BSWAP, VT, Expand);
681     setOperationAction(ISD::SETCC, VT, Expand);
682     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
683     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
684     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
685     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
686     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
687     setOperationAction(ISD::TRUNCATE, VT, Expand);
688     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
689     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
690     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
691     setOperationAction(ISD::SELECT_CC, VT, Expand);
692     for (MVT InnerVT : MVT::vector_valuetypes()) {
693       setTruncStoreAction(InnerVT, VT, Expand);
694 
695       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
696       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
697 
698       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
699       // types, we have to deal with them whether we ask for Expansion or not.
700       // Setting Expand causes its own optimisation problems though, so leave
701       // them legal.
702       if (VT.getVectorElementType() == MVT::i1)
703         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
704 
705       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
706       // split/scalarized right now.
707       if (VT.getVectorElementType() == MVT::f16)
708         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
709     }
710   }
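  // The loop above starts every vector (opcode, type) entry out as Expand;
  // the feature-gated blocks that follow (MMX, SSE1, SSE2, SSSE3, SSE4.1,
  // XOP, AVX, AVX512, ...) then selectively flip entries back to Legal or
  // Custom where the hardware actually supports them.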
711 
712   // FIXME: In order to prevent SSE instructions from being expanded to MMX
713   // ones with -msoft-float, disable use of MMX as well.
714   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
715     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
716     // No operations on x86mmx supported, everything uses intrinsics.
717   }
718 
719   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
720     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
721 
722     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
723     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
724     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
725     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
726     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
727     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
728     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
729     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
730   }
731 
732   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
733     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
734 
735     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
736     // registers cannot be used even for integer operations.
737     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
738     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
739     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
740     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
741 
742     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
743     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
744     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
745     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
746     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
747     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
748     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
749     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
750     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
751     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
752     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
753     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
754 
755     setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
756     setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
757     setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
758     setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
759 
760     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
761     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
762     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
763     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
764 
765     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
766     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
767     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
768     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
769     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
770 
771     setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
772     setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
773     setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
774     setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
775 
776     setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
777     setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
778     setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
779     // ISD::CTTZ v2i64 - scalarization is faster.
780 
781     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
782     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
783       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
784       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
785       setOperationAction(ISD::VSELECT,            VT, Custom);
786       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
787     }
788 
789     // We support custom legalizing of sext and anyext loads for specific
790     // memory vector types which we can load as a scalar (or sequence of
791     // scalars) and extend in-register to a legal 128-bit vector type. For sext
792     // loads these must work with a single scalar load.
793     for (MVT VT : MVT::integer_vector_valuetypes()) {
794       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
795       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
796       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
797       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
798       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
799       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
800       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
801       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
802       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
803     }
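    // For example, a sign-extending load of v4i8 used as v4i32 can be emitted
    // as one 32-bit scalar load followed by an in-register extension
    // (PMOVSXBD with SSE4.1, or an unpack-and-shift sequence otherwise),
    // instead of four separate byte loads.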
804 
805     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
806       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
807       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
808       setOperationAction(ISD::VSELECT,            VT, Custom);
809 
810       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
811         continue;
812 
813       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
814       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
815     }
816 
817     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
818     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
819       setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
820       setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
821       setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
822       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
823       setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
824     }
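    // Promotion of the bitwise ops just re-types them: a v4i32 AND, for
    // instance, is bitcast to v2i64, done as a single PAND, and bitcast back,
    // since bitwise results do not depend on the element width.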
825 
826     // Custom lower v2i64 and v2f64 selects.
827     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
828     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
829 
830     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
831     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
832 
833     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
834 
835     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
836     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
837     // As there is no 64-bit GPR available, we need to build a special
838     // custom sequence to convert from v2i32 to v2f32.
839     if (!Subtarget.is64Bit())
840       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
841 
842     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
843     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
844 
845     for (MVT VT : MVT::fp_vector_valuetypes())
846       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
847 
848     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
849     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
850     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
851 
852     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
853     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
854     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
855 
856     for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
857       setOperationAction(ISD::SRL, VT, Custom);
858       setOperationAction(ISD::SHL, VT, Custom);
859       setOperationAction(ISD::SRA, VT, Custom);
860     }
861 
862     // In the customized shift lowering, the legal cases in AVX2 will be
863     // recognized.
864     for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
865       setOperationAction(ISD::SRL, VT, Custom);
866       setOperationAction(ISD::SHL, VT, Custom);
867       setOperationAction(ISD::SRA, VT, Custom);
868     }
869   }
870 
871   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
872     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
873     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
874     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
875     // ISD::CTLZ v4i32 - scalarization is faster.
876     // ISD::CTLZ v2i64 - scalarization is faster.
877   }
878 
879   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
880     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
881       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
882       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
883       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
884       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
885       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
886     }
887 
888     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
889     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
890     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
891     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
892     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
893     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
894     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
895     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
896 
897     // FIXME: Do we need to handle scalar-to-vector here?
898     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
899 
900     // We directly match byte blends in the backend as they match the VSELECT
901     // condition form.
902     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
903 
904     // SSE41 brings specific instructions for doing vector sign extend even in
905     // cases where we don't have SRA.
906     for (MVT VT : MVT::integer_vector_valuetypes()) {
907       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
908       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
909       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
910     }
911 
912     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
913     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
914     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
915     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
916     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
917     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
918     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
919 
920     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
921     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
922     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
923     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
924     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
925     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
926 
927     // i8 vectors are custom because the source register and source memory
928     // operand types are not the same width.
929     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
930   }
931 
932   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
933     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
934                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
935       setOperationAction(ISD::ROTL, VT, Custom);
936 
937     // XOP can efficiently perform BITREVERSE with VPPERM.
938     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
939       setOperationAction(ISD::BITREVERSE, VT, Custom);
940 
941     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
942                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
943       setOperationAction(ISD::BITREVERSE, VT, Custom);
944   }
945 
946   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
947     bool HasInt256 = Subtarget.hasInt256();
948 
949     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
950     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
951     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
952     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
953     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
954     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
955 
956     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
957       setOperationAction(ISD::FFLOOR,     VT, Legal);
958       setOperationAction(ISD::FCEIL,      VT, Legal);
959       setOperationAction(ISD::FTRUNC,     VT, Legal);
960       setOperationAction(ISD::FRINT,      VT, Legal);
961       setOperationAction(ISD::FNEARBYINT, VT, Legal);
962       setOperationAction(ISD::FNEG,       VT, Custom);
963       setOperationAction(ISD::FABS,       VT, Custom);
964     }
965 
966     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
967     // even though v8i16 is a legal type.
968     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
969     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
970     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
971 
972     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
973     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
974     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
975 
976     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
977     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
978 
979     for (MVT VT : MVT::fp_vector_valuetypes())
980       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
981 
982     for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
983       setOperationAction(ISD::SRL, VT, Custom);
984       setOperationAction(ISD::SHL, VT, Custom);
985       setOperationAction(ISD::SRA, VT, Custom);
986     }
987 
988     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
989     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
990     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
991     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
992 
993     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
994     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
995     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
996 
997     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
998     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
999     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1000     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1001     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1002     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1003     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1004     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1005     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1006     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1007     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1008     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1009     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1010 
1011     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1012       setOperationAction(ISD::CTPOP,           VT, Custom);
1013       setOperationAction(ISD::CTTZ,            VT, Custom);
1014     }
1015 
1016     // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
1017     // as we end up splitting the 256-bit vectors.
1018     for (auto VT : { MVT::v32i8, MVT::v16i16 })
1019       setOperationAction(ISD::CTLZ,            VT, Custom);
1020 
1021     if (HasInt256)
1022       for (auto VT : { MVT::v8i32, MVT::v4i64 })
1023         setOperationAction(ISD::CTLZ,          VT, Custom);
1024 
1025     if (Subtarget.hasAnyFMA()) {
1026       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1027                        MVT::v2f64, MVT::v4f64 })
1028         setOperationAction(ISD::FMA, VT, Legal);
1029     }
1030 
1031     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1032       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1033       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1034     }
1035 
1036     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1037     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1038     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1039     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1040 
1041     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
1042     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
1043 
1044     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1045     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1046     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1047     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1048 
1049     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1050       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1051       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1052       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1053       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1054     }
1055 
1056     if (HasInt256) {
1057       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
1058       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
1059       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1060 
1061       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1062       // when we have a 256-bit-wide blend with immediate.
1063       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1064 
1065       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1066       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1067       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1068       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1069       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1070       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1071       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1072 
1073       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1074       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1075       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1076       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1077       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1078       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1079     }
1080 
1081     // In the customized shift lowering, the legal cases in AVX2 will be
1082     // recognized.
1083     for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1084       setOperationAction(ISD::SRL, VT, Custom);
1085       setOperationAction(ISD::SHL, VT, Custom);
1086       setOperationAction(ISD::SRA, VT, Custom);
1087     }
1088 
1089     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1090                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1091       setOperationAction(ISD::MLOAD,  VT, Legal);
1092       setOperationAction(ISD::MSTORE, VT, Legal);
1093     }
1094 
1095     // Extract subvector is special because the value type
1096     // (result) is 128-bit but the source is 256-bit wide.
1097     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1098                      MVT::v4f32, MVT::v2f64 }) {
1099       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1100     }
1101 
1102     // Custom lower several nodes for 256-bit types.
1103     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1104                     MVT::v8f32, MVT::v4f64 }) {
1105       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1106       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1107       setOperationAction(ISD::VSELECT,            VT, Custom);
1108       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1109       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1110       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1111       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1112       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1113     }
1114 
1115     if (HasInt256)
1116       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1117 
1118     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1119     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1120       setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
1121       setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
1122       setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
1123       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
1124       setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1125     }
1126   }
1127 
1128   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1129     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1130     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1131     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1132     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1133 
1134     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1135     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1136     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1137 
1138     for (MVT VT : MVT::fp_vector_valuetypes())
1139       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1140 
1141     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1142       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1143       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1144       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
1145       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1146       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1147       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1148     }
1149     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1150     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1151     setOperationAction(ISD::SETCCE,             MVT::i1,    Custom);
1152     setOperationAction(ISD::SELECT_CC,          MVT::i1,    Expand);
1153     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1154     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1155     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1156     setOperationAction(ISD::SUB,                MVT::i1,    Custom);
1157     setOperationAction(ISD::ADD,                MVT::i1,    Custom);
1158     setOperationAction(ISD::MUL,                MVT::i1,    Custom);
1159 
1160     for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1161                    MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1162                    MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1163       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1164       setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1165       setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1166       setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
1167       setTruncStoreAction(VT, MaskVT, Custom);
1168     }
1169 
1170     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1171       setOperationAction(ISD::FNEG,  VT, Custom);
1172       setOperationAction(ISD::FABS,  VT, Custom);
1173       setOperationAction(ISD::FMA,   VT, Legal);
1174     }
1175 
1176     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1178     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1179     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1182     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1183     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1184     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1186     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1187     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1188     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
1189     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
1190     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1191     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1192 
1193     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1194     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1195     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1196     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1197     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1198     if (Subtarget.hasVLX()) {
1199       setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1200       setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1201       setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1202       setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1203       setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1204 
1205       setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1206       setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1207       setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1208       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1209       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1210     } else {
1211       setOperationAction(ISD::MLOAD,    MVT::v8i32, Custom);
1212       setOperationAction(ISD::MLOAD,    MVT::v8f32, Custom);
1213       setOperationAction(ISD::MSTORE,   MVT::v8i32, Custom);
1214       setOperationAction(ISD::MSTORE,   MVT::v8f32, Custom);
1215     }
1216     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1217     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1218     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1219     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
1220     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i1, Custom);
1221     setOperationAction(ISD::VSELECT,            MVT::v8i1,  Expand);
1222     setOperationAction(ISD::VSELECT,            MVT::v16i1, Expand);
1223     if (Subtarget.hasDQI()) {
1224       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i64, Legal);
1225       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i64, Legal);
1226       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i64, Legal);
1227       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i64, Legal);
1228       if (Subtarget.hasVLX()) {
1229         setOperationAction(ISD::SINT_TO_FP,    MVT::v4i64, Legal);
1230         setOperationAction(ISD::SINT_TO_FP,    MVT::v2i64, Legal);
1231         setOperationAction(ISD::UINT_TO_FP,    MVT::v4i64, Legal);
1232         setOperationAction(ISD::UINT_TO_FP,    MVT::v2i64, Legal);
1233         setOperationAction(ISD::FP_TO_SINT,    MVT::v4i64, Legal);
1234         setOperationAction(ISD::FP_TO_SINT,    MVT::v2i64, Legal);
1235         setOperationAction(ISD::FP_TO_UINT,    MVT::v4i64, Legal);
1236         setOperationAction(ISD::FP_TO_UINT,    MVT::v2i64, Legal);
1237       }
1238     }
1239     if (Subtarget.hasVLX()) {
1240       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
1241       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
1242       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
1243       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
1244       setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
1245       setOperationAction(ISD::UINT_TO_FP,       MVT::v4i32, Legal);
1246       setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
1247       setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
1248       setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
1249       setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
1250 
1251       // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
1252       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
1253       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1254       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1255       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1256       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
1257       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1258       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1259       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1260       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1261       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1262     }
1263 
1264     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1265     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1266     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1267     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1268     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1269     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
1270     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
1271     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1272     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1273     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1274     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1275     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1276     if (Subtarget.hasDQI()) {
1277       setOperationAction(ISD::SIGN_EXTEND,        MVT::v4i32, Custom);
1278       setOperationAction(ISD::SIGN_EXTEND,        MVT::v2i64, Custom);
1279     }
1280     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1281       setOperationAction(ISD::FFLOOR,     VT, Legal);
1282       setOperationAction(ISD::FCEIL,      VT, Legal);
1283       setOperationAction(ISD::FTRUNC,     VT, Legal);
1284       setOperationAction(ISD::FRINT,      VT, Legal);
1285       setOperationAction(ISD::FNEARBYINT, VT, Legal);
1286     }
1287 
1288     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1289     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1290     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1291     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1292     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
1293 
1294     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1295     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1296 
1297     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1298 
1299     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1300     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1301     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
1302     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1303     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1304     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1305     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1306     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1307     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1308     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1309     setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
1310     setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
1311 
1312     setOperationAction(ISD::SMAX,               MVT::v16i32, Legal);
1313     setOperationAction(ISD::SMAX,               MVT::v8i64, Legal);
1314     setOperationAction(ISD::UMAX,               MVT::v16i32, Legal);
1315     setOperationAction(ISD::UMAX,               MVT::v8i64, Legal);
1316     setOperationAction(ISD::SMIN,               MVT::v16i32, Legal);
1317     setOperationAction(ISD::SMIN,               MVT::v8i64, Legal);
1318     setOperationAction(ISD::UMIN,               MVT::v16i32, Legal);
1319     setOperationAction(ISD::UMIN,               MVT::v8i64, Legal);
1320 
1321     setOperationAction(ISD::ADD,                MVT::v8i1,  Expand);
1322     setOperationAction(ISD::ADD,                MVT::v16i1, Expand);
1323     setOperationAction(ISD::SUB,                MVT::v8i1,  Expand);
1324     setOperationAction(ISD::SUB,                MVT::v16i1, Expand);
1325     setOperationAction(ISD::MUL,                MVT::v8i1,  Expand);
1326     setOperationAction(ISD::MUL,                MVT::v16i1, Expand);
1327 
1328     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1329 
1330     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331       setOperationAction(ISD::SRL, VT, Custom);
1332       setOperationAction(ISD::SHL, VT, Custom);
1333       setOperationAction(ISD::SRA, VT, Custom);
1334       setOperationAction(ISD::AND, VT, Legal);
1335       setOperationAction(ISD::OR,  VT, Legal);
1336       setOperationAction(ISD::XOR, VT, Legal);
1337       setOperationAction(ISD::CTPOP, VT, Custom);
1338       setOperationAction(ISD::CTTZ, VT, Custom);
1339     }
1340 
1341     if (Subtarget.hasCDI()) {
1342       setOperationAction(ISD::CTLZ,             MVT::v8i64,  Legal);
1343       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1344 
1345       setOperationAction(ISD::CTLZ,             MVT::v8i16,  Custom);
1346       setOperationAction(ISD::CTLZ,             MVT::v16i8,  Custom);
1347       setOperationAction(ISD::CTLZ,             MVT::v16i16, Custom);
1348       setOperationAction(ISD::CTLZ,             MVT::v32i8,  Custom);
1349 
1350       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i64,  Custom);
1351       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v16i32, Custom);
1352 
1353       if (Subtarget.hasVLX()) {
1354         setOperationAction(ISD::CTLZ,             MVT::v4i64, Legal);
1355         setOperationAction(ISD::CTLZ,             MVT::v8i32, Legal);
1356         setOperationAction(ISD::CTLZ,             MVT::v2i64, Legal);
1357         setOperationAction(ISD::CTLZ,             MVT::v4i32, Legal);
1358       } else {
1359         setOperationAction(ISD::CTLZ,             MVT::v4i64, Custom);
1360         setOperationAction(ISD::CTLZ,             MVT::v8i32, Custom);
1361         setOperationAction(ISD::CTLZ,             MVT::v2i64, Custom);
1362         setOperationAction(ISD::CTLZ,             MVT::v4i32, Custom);
1363       }
1364 
1365       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i64, Custom);
1366       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i32, Custom);
1367       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v2i64, Custom);
1368       setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i32, Custom);
1369     } // Subtarget.hasCDI()
1370 
1371     if (Subtarget.hasDQI()) {
1372       if (Subtarget.hasVLX()) {
1373         setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
1374         setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
1375       }
1376       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
1377     }
1378     // Custom lower several nodes.
1379     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1380                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1381       setOperationAction(ISD::MGATHER,  VT, Custom);
1382       setOperationAction(ISD::MSCATTER, VT, Custom);
1383     }
1384     // Extract subvector is special because the value type
1385     // (result) is 256-bit but the source is 512-bit wide.
1386     // 128-bit was made Custom under AVX1.
1387     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1388                      MVT::v8f32, MVT::v4f64 })
1389       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1390     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1391                      MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1392       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1393 
1394     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1395       setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1396       setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1397       setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1398       setOperationAction(ISD::VSELECT,             VT, Legal);
1399       setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1400       setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1401       setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1402       setOperationAction(ISD::MLOAD,               VT, Legal);
1403       setOperationAction(ISD::MSTORE,              VT, Legal);
1404       setOperationAction(ISD::MGATHER,             VT, Legal);
1405       setOperationAction(ISD::MSCATTER,            VT, Custom);
1406     }
1407     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1408       setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1409     }
1410   } // has AVX-512
1411 
1412   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1413     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1414     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1415 
1416     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1417     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1418 
1419     setOperationAction(ISD::ADD,                MVT::v32i1, Expand);
1420     setOperationAction(ISD::ADD,                MVT::v64i1, Expand);
1421     setOperationAction(ISD::SUB,                MVT::v32i1, Expand);
1422     setOperationAction(ISD::SUB,                MVT::v64i1, Expand);
1423     setOperationAction(ISD::MUL,                MVT::v32i1, Expand);
1424     setOperationAction(ISD::MUL,                MVT::v64i1, Expand);
1425 
1426     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1427     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1428     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1429     setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
1430     setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
1431     setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
1432     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
1433     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
1434     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
1435     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
1436     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
1437     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
1438     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
1439     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
1440     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1441     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1442     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
1443     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
1444     setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
1445     setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
1446     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1447     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1448     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
1449     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
1450     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
1451     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
1452     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
1453     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1454     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1455     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
1456     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
1457     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
1458     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
1459     setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
1460     setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
1461     setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
1462     setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
1463     setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
1464     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
1465     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
1466     setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
1467     setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
1468     setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
1469     setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
1470     setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
1471 
1472     setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
1473     setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
1474     setOperationAction(ISD::UMAX,               MVT::v64i8, Legal);
1475     setOperationAction(ISD::UMAX,               MVT::v32i16, Legal);
1476     setOperationAction(ISD::SMIN,               MVT::v64i8, Legal);
1477     setOperationAction(ISD::SMIN,               MVT::v32i16, Legal);
1478     setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
1479     setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
1480 
1481     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1482     setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1483     if (Subtarget.hasVLX())
1484       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1485 
1486     LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1487     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1488       setOperationAction(ISD::MLOAD,               VT, Action);
1489       setOperationAction(ISD::MSTORE,              VT, Action);
1490     }
1491 
1492     if (Subtarget.hasCDI()) {
1493       setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
1494       setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
1495     }
1496 
1497     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1498       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1499       setOperationAction(ISD::VSELECT,      VT, Legal);
1500       setOperationAction(ISD::SRL,          VT, Custom);
1501       setOperationAction(ISD::SHL,          VT, Custom);
1502       setOperationAction(ISD::SRA,          VT, Custom);
1503       setOperationAction(ISD::MLOAD,        VT, Legal);
1504       setOperationAction(ISD::MSTORE,       VT, Legal);
1505       setOperationAction(ISD::CTPOP,        VT, Custom);
1506       setOperationAction(ISD::CTTZ,         VT, Custom);
1507 
1508       setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
1509       setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
1510       setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
1511     }
1512 
1513     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1514       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1515       if (Subtarget.hasVLX()) {
1516         // FIXME: These instructions are available on SSE/AVX2; add the relevant patterns.
1517         setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1518         setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
1519       }
1520     }
1521   }
1522 
1523   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1524     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1525     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1526 
1527     setOperationAction(ISD::ADD,                MVT::v2i1, Expand);
1528     setOperationAction(ISD::ADD,                MVT::v4i1, Expand);
1529     setOperationAction(ISD::SUB,                MVT::v2i1, Expand);
1530     setOperationAction(ISD::SUB,                MVT::v4i1, Expand);
1531     setOperationAction(ISD::MUL,                MVT::v2i1, Expand);
1532     setOperationAction(ISD::MUL,                MVT::v4i1, Expand);
1533 
1534     setOperationAction(ISD::TRUNCATE,           MVT::v2i1, Custom);
1535     setOperationAction(ISD::TRUNCATE,           MVT::v4i1, Custom);
1536     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1537     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1538     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
1539     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
1540     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
1541     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
1542     setOperationAction(ISD::SELECT,             MVT::v4i1, Custom);
1543     setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
1544     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
1545     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);
1546     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i1, Custom);
1547     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i1, Custom);
1548     setOperationAction(ISD::VSELECT,            MVT::v2i1, Expand);
1549     setOperationAction(ISD::VSELECT,            MVT::v4i1, Expand);
1550 
1551     for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
1552       setOperationAction(ISD::AND, VT, Legal);
1553       setOperationAction(ISD::OR,  VT, Legal);
1554       setOperationAction(ISD::XOR, VT, Legal);
1555     }
1556 
1557     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1558       setOperationAction(ISD::SMAX, VT, Legal);
1559       setOperationAction(ISD::UMAX, VT, Legal);
1560       setOperationAction(ISD::SMIN, VT, Legal);
1561       setOperationAction(ISD::UMIN, VT, Legal);
1562     }
1563   }
1564 
1565   // We want to custom lower some of our intrinsics.
1566   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1567   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1568   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1569   if (!Subtarget.is64Bit()) {
1570     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1571     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1572   }
1573 
1574   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1575   // handle type legalization for these operations here.
1576   //
1577   // FIXME: We really should do custom legalization for addition and
1578   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1579   // than generic legalization for 64-bit multiplication-with-overflow, though.
1580   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1581     if (VT == MVT::i64 && !Subtarget.is64Bit())
1582       continue;
1583     // Add/Sub/Mul with overflow operations are custom lowered.
1584     setOperationAction(ISD::SADDO, VT, Custom);
1585     setOperationAction(ISD::UADDO, VT, Custom);
1586     setOperationAction(ISD::SSUBO, VT, Custom);
1587     setOperationAction(ISD::USUBO, VT, Custom);
1588     setOperationAction(ISD::SMULO, VT, Custom);
1589     setOperationAction(ISD::UMULO, VT, Custom);
1590   }
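  // Illustrative effect of the custom lowering above (sketch): an
  // @llvm.sadd.with.overflow.i32 call is selected into an X86 add that also
  // defines EFLAGS, with the overflow bit recovered from the flags, roughly
  //   %sum = X86ISD::ADD %a, %b     ; second result is the flags
  //   %ovf = setcc on the overflow flag
  // instead of the generic expansion based on explicit sign comparisons.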
1591 
1592   if (!Subtarget.is64Bit()) {
1593     // These libcalls are not available in 32-bit mode.
1594     setLibcallName(RTLIB::SHL_I128, nullptr);
1595     setLibcallName(RTLIB::SRL_I128, nullptr);
1596     setLibcallName(RTLIB::SRA_I128, nullptr);
1597   }
1598 
1599   // Combine sin / cos into one node or libcall if possible.
1600   if (Subtarget.hasSinCos()) {
1601     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1602     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1603     if (Subtarget.isTargetDarwin()) {
1604       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1605       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1606       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1607       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1608     }
1609   }
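  // Illustrative effect (sketch): on Darwin, a sinf/cosf pair on the same
  // operand, e.g.
  //   %s = call float @sinf(float %x)
  //   %c = call float @cosf(float %x)
  // can be combined into a single FSINCOS node and emitted as one
  // __sincos_stret call returning both results, rather than two libcalls.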
1610 
1611   if (Subtarget.isTargetWin64()) {
1612     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1613     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1614     setOperationAction(ISD::SREM, MVT::i128, Custom);
1615     setOperationAction(ISD::UREM, MVT::i128, Custom);
1616     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1617     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1618   }
1619 
1620   // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1621   // is. We should promote the value to 64-bits to solve this.
1622   // This is what the CRT headers do - `fmodf` is an inline header
1623   // function casting to f64 and calling `fmod`.
1624   if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
1625     for (ISD::NodeType Op :
1626          {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1627           ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1628       if (isOperationExpand(Op, MVT::f32))
1629         setOperationAction(Op, MVT::f32, Promote);
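  // Illustrative effect (sketch): with the promotion above, a 32-bit MSVC
  // target lowers
  //   float r = fmodf(a, b);
  // as an f64 operation, i.e. (float)fmod((double)a, (double)b), matching the
  // CRT's inline fmodf wrapper described above.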
1630 
1631   // We have target-specific dag combine patterns for the following nodes:
1632   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1633   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1634   setTargetDAGCombine(ISD::BITCAST);
1635   setTargetDAGCombine(ISD::VSELECT);
1636   setTargetDAGCombine(ISD::SELECT);
1637   setTargetDAGCombine(ISD::SHL);
1638   setTargetDAGCombine(ISD::SRA);
1639   setTargetDAGCombine(ISD::SRL);
1640   setTargetDAGCombine(ISD::OR);
1641   setTargetDAGCombine(ISD::AND);
1642   setTargetDAGCombine(ISD::ADD);
1643   setTargetDAGCombine(ISD::FADD);
1644   setTargetDAGCombine(ISD::FSUB);
1645   setTargetDAGCombine(ISD::FNEG);
1646   setTargetDAGCombine(ISD::FMA);
1647   setTargetDAGCombine(ISD::FMINNUM);
1648   setTargetDAGCombine(ISD::FMAXNUM);
1649   setTargetDAGCombine(ISD::SUB);
1650   setTargetDAGCombine(ISD::LOAD);
1651   setTargetDAGCombine(ISD::MLOAD);
1652   setTargetDAGCombine(ISD::STORE);
1653   setTargetDAGCombine(ISD::MSTORE);
1654   setTargetDAGCombine(ISD::TRUNCATE);
1655   setTargetDAGCombine(ISD::ZERO_EXTEND);
1656   setTargetDAGCombine(ISD::ANY_EXTEND);
1657   setTargetDAGCombine(ISD::SIGN_EXTEND);
1658   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659   setTargetDAGCombine(ISD::SINT_TO_FP);
1660   setTargetDAGCombine(ISD::UINT_TO_FP);
1661   setTargetDAGCombine(ISD::SETCC);
1662   setTargetDAGCombine(ISD::MUL);
1663   setTargetDAGCombine(ISD::XOR);
1664   setTargetDAGCombine(ISD::MSCATTER);
1665   setTargetDAGCombine(ISD::MGATHER);
1666 
1667   computeRegisterProperties(Subtarget.getRegisterInfo());
1668 
1669   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1670   MaxStoresPerMemsetOptSize = 8;
1671   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1672   MaxStoresPerMemcpyOptSize = 4;
1673   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1674   MaxStoresPerMemmoveOptSize = 4;
1675   setPrefLoopAlignment(4); // 2^4 bytes.
1676 
1677   // An out-of-order CPU can speculatively execute past a predictable branch,
1678   // but a conditional move could be stalled by an expensive earlier operation.
1679   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1680   EnableExtLdPromotion = true;
1681   setPrefFunctionAlignment(4); // 2^4 bytes.
1682 
1683   verifyIntrinsicTables();
1684 }
1685 
1686 // This has so far only been implemented for 64-bit MachO.
1687 bool X86TargetLowering::useLoadStackGuardNode() const {
1688   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1689 }
1690 
1691 TargetLoweringBase::LegalizeTypeAction
1692 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1693   if (ExperimentalVectorWideningLegalization &&
1694       VT.getVectorNumElements() != 1 &&
1695       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1696     return TypeWidenVector;
1697 
1698   return TargetLoweringBase::getPreferredVectorAction(VT);
1699 }
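// Illustrative example (sketch, assuming the experimental flag is enabled):
// an illegal narrow type such as v2i16 is then widened to a legal vector,
//   %r = add <2 x i16> %a, %b    ; legalized as an add of <8 x i16>
// rather than having its elements promoted to a wider integer type, except
// for single-element vectors and vectors of i1, which keep the default
// treatment.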
1700 
1701 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1702                                           LLVMContext& Context,
1703                                           EVT VT) const {
1704   if (!VT.isVector())
1705     return Subtarget.hasAVX512() ? MVT::i1 : MVT::i8;
1706 
1707   if (VT.isSimple()) {
1708     MVT VVT = VT.getSimpleVT();
1709     const unsigned NumElts = VVT.getVectorNumElements();
1710     MVT EltVT = VVT.getVectorElementType();
1711     if (VVT.is512BitVector()) {
1712       if (Subtarget.hasAVX512())
1713         if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1714             EltVT == MVT::f32 || EltVT == MVT::f64)
1715           switch(NumElts) {
1716           case  8: return MVT::v8i1;
1717           case 16: return MVT::v16i1;
1718         }
1719       if (Subtarget.hasBWI())
1720         if (EltVT == MVT::i8 || EltVT == MVT::i16)
1721           switch(NumElts) {
1722           case 32: return MVT::v32i1;
1723           case 64: return MVT::v64i1;
1724         }
1725     }
1726 
1727     if (Subtarget.hasBWI() && Subtarget.hasVLX())
1728       return MVT::getVectorVT(MVT::i1, NumElts);
1729 
1730     if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1731       EVT LegalVT = getTypeToTransformTo(Context, VT);
1732       EltVT = LegalVT.getVectorElementType().getSimpleVT();
1733     }
1734 
1735     if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1736       switch(NumElts) {
1737       case 2: return MVT::v2i1;
1738       case 4: return MVT::v4i1;
1739       case 8: return MVT::v8i1;
1740       }
1741   }
1742 
1743   return VT.changeVectorElementTypeToInteger();
1744 }
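// A few illustrative results of the query above (sketch, not exhaustive):
//   v16f32 setcc with AVX-512          -> v16i1 mask
//   v32i16 setcc with AVX-512 + BWI    -> v32i1 mask
//   v4i32 setcc with VLX               -> v4i1 mask
//   v4i32 setcc with plain SSE/AVX     -> v4i32 (all-ones / all-zeros lanes)
//   scalar setcc with AVX-512          -> i1, otherwise i8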
1745 
1746 /// Helper for getByValTypeAlignment to determine
1747 /// the desired ByVal argument alignment.
1748 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1749   if (MaxAlign == 16)
1750     return;
1751   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1752     if (VTy->getBitWidth() == 128)
1753       MaxAlign = 16;
1754   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1755     unsigned EltAlign = 0;
1756     getMaxByValAlign(ATy->getElementType(), EltAlign);
1757     if (EltAlign > MaxAlign)
1758       MaxAlign = EltAlign;
1759   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1760     for (auto *EltTy : STy->elements()) {
1761       unsigned EltAlign = 0;
1762       getMaxByValAlign(EltTy, EltAlign);
1763       if (EltAlign > MaxAlign)
1764         MaxAlign = EltAlign;
1765       if (MaxAlign == 16)
1766         break;
1767     }
1768   }
1769 }
1770 
1771 /// Return the desired alignment for ByVal aggregate
1772 /// function arguments in the caller parameter area. For X86, aggregates
1773 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1774 /// are at 4-byte boundaries.
1775 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1776                                                   const DataLayout &DL) const {
1777   if (Subtarget.is64Bit()) {
1778     // Max of 8 and alignment of type.
1779     unsigned TyAlign = DL.getABITypeAlignment(Ty);
1780     if (TyAlign > 8)
1781       return TyAlign;
1782     return 8;
1783   }
1784 
1785   unsigned Align = 4;
1786   if (Subtarget.hasSSE1())
1787     getMaxByValAlign(Ty, Align);
1788   return Align;
1789 }
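// Illustrative example (sketch): on 32-bit x86 with SSE enabled, a byval
// aggregate containing a 128-bit vector member, e.g. the hypothetical
//   struct S { __m128 V; int I; };
// reports a 16-byte alignment, while a struct of plain scalars stays at the
// 4-byte default. On x86-64 the result is simply max(8, ABI alignment).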
1790 
1791 /// Returns the target specific optimal type for load
1792 /// and store operations as a result of memset, memcpy, and memmove
1793 /// lowering. If DstAlign is zero, the destination can satisfy any alignment
1794 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
1795 /// against an alignment requirement, probably because the source does not
1796 /// need to be loaded. If 'IsMemset' is
1797 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1798 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1799 /// source is constant so it does not need to be loaded.
1800 /// It returns EVT::Other if the type should be determined using generic
1801 /// target-independent logic.
1802 EVT
1803 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1804                                        unsigned DstAlign, unsigned SrcAlign,
1805                                        bool IsMemset, bool ZeroMemset,
1806                                        bool MemcpyStrSrc,
1807                                        MachineFunction &MF) const {
1808   const Function *F = MF.getFunction();
1809   if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1810     if (Size >= 16 &&
1811         (!Subtarget.isUnalignedMem16Slow() ||
1812          ((DstAlign == 0 || DstAlign >= 16) &&
1813           (SrcAlign == 0 || SrcAlign >= 16)))) {
1814       // FIXME: Check if unaligned 32-byte accesses are slow.
1815       if (Size >= 32 && Subtarget.hasAVX()) {
1816         // Although this isn't a well-supported type for AVX1, we'll let
1817         // legalization and shuffle lowering produce the optimal codegen. If we
1818         // choose an optimal type with a vector element larger than a byte,
1819         // getMemsetStores() may create an intermediate splat (using an integer
1820         // multiply) before we splat as a vector.
1821         return MVT::v32i8;
1822       }
1823       if (Subtarget.hasSSE2())
1824         return MVT::v16i8;
1825       // TODO: Can SSE1 handle a byte vector?
1826       if (Subtarget.hasSSE1())
1827         return MVT::v4f32;
1828     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1829                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1830       // Do not use f64 to lower memcpy if the source is a string constant. It's
1831       // better to use i32 to avoid the loads.
1832       // Also, do not use f64 to lower memset unless this is a memset of zeros.
1833       // The gymnastics of splatting a byte value into an XMM register and then
1834       // only using 8-byte stores (because this is a CPU with slow unaligned
1835       // 16-byte accesses) makes that a loser.
1836       return MVT::f64;
1837     }
1838   }
1839   // This is a compromise. If we reach here, unaligned accesses may be slow on
1840   // this target. However, creating smaller, aligned accesses could be even
1841   // slower and would certainly be a lot more code.
1842   if (Subtarget.is64Bit() && Size >= 8)
1843     return MVT::i64;
1844   return MVT::i32;
1845 }
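// Illustrative decisions made above (sketch):
//   memcpy of 64 bytes, AVX, aligned or fast-unaligned 16-byte access -> v32i8
//   memcpy of 16+ bytes with SSE2 under the same conditions           -> v16i8
//   memset of zeros, 8+ bytes, 32-bit target with SSE2 (when the
//     vector path does not apply)                                     -> f64
//   otherwise i64 on 64-bit targets (for 8+ bytes) and i32 elsewhere.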
1846 
1847 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1848   if (VT == MVT::f32)
1849     return X86ScalarSSEf32;
1850   else if (VT == MVT::f64)
1851     return X86ScalarSSEf64;
1852   return true;
1853 }
1854 
1855 bool
1856 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1857                                                   unsigned,
1858                                                   unsigned,
1859                                                   bool *Fast) const {
1860   if (Fast) {
1861     switch (VT.getSizeInBits()) {
1862     default:
1863       // 8-byte and under are always assumed to be fast.
1864       *Fast = true;
1865       break;
1866     case 128:
1867       *Fast = !Subtarget.isUnalignedMem16Slow();
1868       break;
1869     case 256:
1870       *Fast = !Subtarget.isUnalignedMem32Slow();
1871       break;
1872     // TODO: What about AVX-512 (512-bit) accesses?
1873     }
1874   }
1875   // Misaligned accesses of any size are always allowed.
1876   return true;
1877 }
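// Example query (sketch; hypothetical caller): asking about an unaligned
// 256-bit access,
//   bool Fast;
//   bool OK = TLI.allowsMisalignedMemoryAccesses(MVT::v8f32, /*AddrSpace=*/0,
//                                                /*Align=*/1, &Fast);
// always yields OK == true on x86, while Fast reflects whether the subtarget
// reports slow unaligned 32-byte memory accesses.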
1878 
1879 /// Return the entry encoding for a jump table in the
1880 /// current function.  The returned value is a member of the
1881 /// MachineJumpTableInfo::JTEntryKind enum.
1882 unsigned X86TargetLowering::getJumpTableEncoding() const {
1883   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1884   // symbol.
1885   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1886     return MachineJumpTableInfo::EK_Custom32;
1887 
1888   // Otherwise, use the normal jump table encoding heuristics.
1889   return TargetLowering::getJumpTableEncoding();
1890 }
1891 
1892 bool X86TargetLowering::useSoftFloat() const {
1893   return Subtarget.useSoftFloat();
1894 }
1895 
1896 const MCExpr *
1897 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1898                                              const MachineBasicBlock *MBB,
1899                                              unsigned uid, MCContext &Ctx) const {
1900   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1901   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1902   // entries.
1903   return MCSymbolRefExpr::create(MBB->getSymbol(),
1904                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1905 }
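// Illustrative output (sketch): with the custom encoding above, a 32-bit GOT
// PIC jump table entry is emitted as a @GOTOFF reference, e.g.
//   .long .LBB0_7@GOTOFF
// which the dispatch code adds to the PIC base register, instead of storing
// an absolute block address. (.LBB0_7 is a hypothetical block label.)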
1906 
1907 /// Returns relocation base for the given PIC jumptable.
1908 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1909                                                     SelectionDAG &DAG) const {
1910   if (!Subtarget.is64Bit())
1911     // This doesn't have SDLoc associated with it, but is not really the
1912     // same as a Register.
1913     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1914                        getPointerTy(DAG.getDataLayout()));
1915   return Table;
1916 }
1917 
1918 /// This returns the relocation base for the given PIC jumptable,
1919 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1920 const MCExpr *X86TargetLowering::
1921 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1922                              MCContext &Ctx) const {
1923   // X86-64 uses RIP relative addressing based on the jump table label.
1924   if (Subtarget.isPICStyleRIPRel())
1925     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1926 
1927   // Otherwise, the reference is relative to the PIC base.
1928   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1929 }
1930 
1931 std::pair<const TargetRegisterClass *, uint8_t>
1932 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1933                                            MVT VT) const {
1934   const TargetRegisterClass *RRC = nullptr;
1935   uint8_t Cost = 1;
1936   switch (VT.SimpleTy) {
1937   default:
1938     return TargetLowering::findRepresentativeClass(TRI, VT);
1939   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1940     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1941     break;
1942   case MVT::x86mmx:
1943     RRC = &X86::VR64RegClass;
1944     break;
1945   case MVT::f32: case MVT::f64:
1946   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1947   case MVT::v4f32: case MVT::v2f64:
1948   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1949   case MVT::v4f64:
1950     RRC = &X86::VR128RegClass;
1951     break;
1952   }
1953   return std::make_pair(RRC, Cost);
1954 }
1955 
1956 unsigned X86TargetLowering::getAddressSpace() const {
1957   if (Subtarget.is64Bit())
1958     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1959   return 256;
1960 }
1961 
1962 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1963   // glibc has a special slot for the stack guard in tcbhead_t, use it instead
1964   // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
1965   if (!Subtarget.isTargetGlibc())
1966     return TargetLowering::getIRStackGuard(IRB);
1967 
1968   // The slot is %fs:0x28 on x86-64 (%gs:0x28 under the Kernel code model) and
1969   // %gs:0x14 on i386.
1970   unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
1971   unsigned AddressSpace = getAddressSpace();
1972   return ConstantExpr::getIntToPtr(
1973       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1974       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
1975 }
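// Illustrative result (sketch): for a glibc target on x86-64 the guard
// location returned above is the TLS slot at %fs:0x28, i.e. a constant of
// the form
//   inttoptr (i32 40 to i8* addrspace(257)*)
// where address space 257 selects the FS segment; 32-bit targets use
// %gs:0x14 in address space 256 instead.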
1976 
1977 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
1978   // The MSVC CRT provides functionality for stack protection.
1979   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
1980     // MSVC CRT has a global variable holding security cookie.
1981     M.getOrInsertGlobal("__security_cookie",
1982                         Type::getInt8PtrTy(M.getContext()));
1983 
1984     // MSVC CRT has a function to validate security cookie.
1985     auto *SecurityCheckCookie = cast<Function>(
1986         M.getOrInsertFunction("__security_check_cookie",
1987                               Type::getVoidTy(M.getContext()),
1988                               Type::getInt8PtrTy(M.getContext()), nullptr));
1989     SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
1990     SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
1991     return;
1992   }
1993   // glibc has a special slot for the stack guard.
1994   if (Subtarget.isTargetGlibc())
1995     return;
1996   TargetLowering::insertSSPDeclarations(M);
1997 }
1998 
1999 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2000   // MSVC CRT has a global variable holding security cookie.
2001   if (Subtarget.getTargetTriple().isOSMSVCRT())
2002     return M.getGlobalVariable("__security_cookie");
2003   return TargetLowering::getSDagStackGuard(M);
2004 }
2005 
2006 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2007   // MSVC CRT has a function to validate security cookie.
2008   if (Subtarget.getTargetTriple().isOSMSVCRT())
2009     return M.getFunction("__security_check_cookie");
2010   return TargetLowering::getSSPStackGuardCheck(M);
2011 }
2012 
2013 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2014   if (!Subtarget.isTargetAndroid())
2015     return TargetLowering::getSafeStackPointerLocation(IRB);
2016 
2017   // Android provides a fixed TLS slot for the SafeStack pointer. See the
2018   // definition of TLS_SLOT_SAFESTACK in
2019   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2020   unsigned AddressSpace, Offset;
2021 
2022   // The slot is %fs:0x48 on x86-64 (%gs:0x48 under the Kernel code model) and
2023   // %gs:0x24 on i386.
2024   Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2025   AddressSpace = getAddressSpace();
2026   return ConstantExpr::getIntToPtr(
2027       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2028       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2029 }
2030 
2031 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2032                                             unsigned DestAS) const {
2033   assert(SrcAS != DestAS && "Expected different address spaces!");
2034 
2035   return SrcAS < 256 && DestAS < 256;
2036 }
2037 
2038 //===----------------------------------------------------------------------===//
2039 //               Return Value Calling Convention Implementation
2040 //===----------------------------------------------------------------------===//
2041 
2042 #include "X86GenCallingConv.inc"
2043 
2044 bool X86TargetLowering::CanLowerReturn(
2045     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2046     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2047   SmallVector<CCValAssign, 16> RVLocs;
2048   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2049   return CCInfo.CheckReturn(Outs, RetCC_X86);
2050 }
2051 
2052 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2053   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2054   return ScratchRegs;
2055 }
2056 
2057 SDValue
2058 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2059                                bool isVarArg,
2060                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2061                                const SmallVectorImpl<SDValue> &OutVals,
2062                                const SDLoc &dl, SelectionDAG &DAG) const {
2063   MachineFunction &MF = DAG.getMachineFunction();
2064   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2065 
2066   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2067     report_fatal_error("X86 interrupts may not return any value");
2068 
2069   SmallVector<CCValAssign, 16> RVLocs;
2070   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2071   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2072 
2073   SDValue Flag;
2074   SmallVector<SDValue, 6> RetOps;
2075   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2076   // Operand #1 = Bytes To Pop
2077   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2078                    MVT::i32));
2079 
2080   // Copy the result values into the output registers.
2081   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2082     CCValAssign &VA = RVLocs[i];
2083     assert(VA.isRegLoc() && "Can only return in registers!");
2084     SDValue ValToCopy = OutVals[i];
2085     EVT ValVT = ValToCopy.getValueType();
2086 
2087     // Promote values to the appropriate types.
2088     if (VA.getLocInfo() == CCValAssign::SExt)
2089       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2090     else if (VA.getLocInfo() == CCValAssign::ZExt)
2091       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2092     else if (VA.getLocInfo() == CCValAssign::AExt) {
2093       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2094         ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2095       else
2096         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2097     }
2098     else if (VA.getLocInfo() == CCValAssign::BCvt)
2099       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2100 
2101     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2102            "Unexpected FP-extend for return value.");
2103 
2104     // If this is x86-64, and we disabled SSE, we can't return FP values,
2105     // or SSE or MMX vectors.
2106     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2107          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2108           (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2109       report_fatal_error("SSE register return with SSE disabled");
2110     }
2111     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2112     // llvm-gcc has never done it right and no one has noticed, so this
2113     // should be OK for now.
2114     if (ValVT == MVT::f64 &&
2115         (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2116       report_fatal_error("SSE2 register return with SSE2 disabled");
2117 
2118     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2119     // the RET instruction and handled by the FP Stackifier.
2120     if (VA.getLocReg() == X86::FP0 ||
2121         VA.getLocReg() == X86::FP1) {
2122       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2123       // change the value to the FP stack register class.
2124       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2125         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2126       RetOps.push_back(ValToCopy);
2127       // Don't emit a copytoreg.
2128       continue;
2129     }
2130 
2131     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2132     // which is returned in RAX / RDX.
2133     if (Subtarget.is64Bit()) {
2134       if (ValVT == MVT::x86mmx) {
2135         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2136           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2137           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2138                                   ValToCopy);
2139           // If we don't have SSE2 available, convert to v4f32 so the generated
2140           // register is legal.
2141           if (!Subtarget.hasSSE2())
2142             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2143         }
2144       }
2145     }
2146 
2147     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2148     Flag = Chain.getValue(1);
2149     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2150   }
2151 
2152   // The Swift calling convention does not require us to copy the sret argument
2153   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2154 
2155   // All x86 ABIs require that for returning structs by value we copy
2156   // the sret argument into %rax/%eax (depending on ABI) for the return.
2157   // We saved the argument into a virtual register in the entry block,
2158   // so now we copy the value out and into %rax/%eax.
2159   //
2160   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2161   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2162   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2163   // either case FuncInfo->setSRetReturnReg() will have been called.
2164   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2165     // When we have both sret and another return value, we should use the
2166     // original Chain stored in RetOps[0], instead of the current Chain updated
2167     // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2168 
2169     // For the case of sret and another return value, we have
2170     //   Chain_0 at the function entry
2171     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2172     // If we use Chain_1 in getCopyFromReg, we will have
2173     //   Val = getCopyFromReg(Chain_1)
2174     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2175 
2176     // getCopyToReg(Chain_0) will be glued together with
2177     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2178     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2179     //   Data dependency from Unit B to Unit A due to usage of Val in
2180     //     getCopyToReg(Chain_1, Val)
2181     //   Chain dependency from Unit A to Unit B
2182 
2183     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2184     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2185                                      getPointerTy(MF.getDataLayout()));
2186 
2187     unsigned RetValReg
2188         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2189           X86::RAX : X86::EAX;
2190     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2191     Flag = Chain.getValue(1);
2192 
2193     // RAX/EAX now acts like a return value.
2194     RetOps.push_back(
2195         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2196   }
2197 
2198   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2199   const MCPhysReg *I =
2200       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2201   if (I) {
2202     for (; *I; ++I) {
2203       if (X86::GR64RegClass.contains(*I))
2204         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2205       else
2206         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2207     }
2208   }
2209 
2210   RetOps[0] = Chain;  // Update chain.
2211 
2212   // Add the flag if we have it.
2213   if (Flag.getNode())
2214     RetOps.push_back(Flag);
2215 
2216   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2217   if (CallConv == CallingConv::X86_INTR)
2218     opcode = X86ISD::IRET;
2219   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2220 }
2221 
2222 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2223   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2224     return false;
2225 
2226   SDValue TCChain = Chain;
2227   SDNode *Copy = *N->use_begin();
2228   if (Copy->getOpcode() == ISD::CopyToReg) {
2229     // If the copy has a glue operand, we conservatively assume it isn't safe to
2230     // perform a tail call.
2231     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2232       return false;
2233     TCChain = Copy->getOperand(0);
2234   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2235     return false;
2236 
2237   bool HasRet = false;
2238   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2239        UI != UE; ++UI) {
2240     if (UI->getOpcode() != X86ISD::RET_FLAG)
2241       return false;
2242     // If we are returning more than one value, we can definitely
2243     // not make a tail call; see PR19530.
2244     if (UI->getNumOperands() > 4)
2245       return false;
2246     if (UI->getNumOperands() == 4 &&
2247         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2248       return false;
2249     HasRet = true;
2250   }
2251 
2252   if (!HasRet)
2253     return false;
2254 
2255   Chain = TCChain;
2256   return true;
2257 }
2258 
2259 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2260                                            ISD::NodeType ExtendKind) const {
2261   MVT ReturnMVT = MVT::i32;
2262 
2263   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2264   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2265     // The ABI does not require i1, i8 or i16 to be extended.
2266     //
2267     // On Darwin, there is code in the wild relying on Clang's old behaviour of
2268     // always extending i8/i16 return values, so keep doing that for now.
2269     // (PR26665).
2270     ReturnMVT = MVT::i8;
2271   }
2272 
2273   EVT MinVT = getRegisterType(Context, ReturnMVT);
2274   return VT.bitsLT(MinVT) ? MinVT : VT;
2275 }
2276 
2277 /// Lower the result values of a call into the
2278 /// appropriate copies out of the corresponding physical registers.
2279 ///
2280 SDValue X86TargetLowering::LowerCallResult(
2281     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2282     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2283     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2284 
2285   // Assign locations to each value returned by this call.
2286   SmallVector<CCValAssign, 16> RVLocs;
2287   bool Is64Bit = Subtarget.is64Bit();
2288   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2289                  *DAG.getContext());
2290   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2291 
2292   // Copy all of the result registers out of their specified physreg.
2293   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2294     CCValAssign &VA = RVLocs[i];
2295     EVT CopyVT = VA.getLocVT();
2296 
2297     // If this is x86-64, and we disabled SSE, we can't return FP values
2298     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2299         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2300       report_fatal_error("SSE register return with SSE disabled");
2301     }
2302 
2303     // If we prefer to use the value in xmm registers, copy it out as f80 and
2304     // use a truncate to move it from fp stack reg to xmm reg.
2305     bool RoundAfterCopy = false;
2306     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2307         isScalarFPTypeInSSEReg(VA.getValVT())) {
2308       if (!Subtarget.hasX87())
2309         report_fatal_error("X87 register return with X87 disabled");
2310       CopyVT = MVT::f80;
2311       RoundAfterCopy = (CopyVT != VA.getLocVT());
2312     }
2313 
2314     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2315                                CopyVT, InFlag).getValue(1);
2316     SDValue Val = Chain.getValue(0);
2317 
2318     if (RoundAfterCopy)
2319       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2320                         // This truncation won't change the value.
2321                         DAG.getIntPtrConstant(1, dl));
2322 
2323     if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
2324       Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2325 
2326     InFlag = Chain.getValue(2);
2327     InVals.push_back(Val);
2328   }
2329 
2330   return Chain;
2331 }
2332 
2333 //===----------------------------------------------------------------------===//
2334 //                C & StdCall & Fast Calling Convention implementation
2335 //===----------------------------------------------------------------------===//
2336 //  The StdCall calling convention is the standard for many Windows API
2337 //  routines. It differs from the C calling convention only slightly: the
2338 //  callee cleans up the stack, not the caller, and symbols are also
2339 //  decorated in some fancy way :) It doesn't support any vector arguments.
2340 //  For info on fast calling convention see Fast Calling Convention (tail call)
2341 //  implementation LowerX86_32FastCCCallTo.
2342 
2343 /// CallIsStructReturn - Determines whether a call uses struct return
2344 /// semantics.
2345 enum StructReturnType {
2346   NotStructReturn,
2347   RegStructReturn,
2348   StackStructReturn
2349 };
2350 static StructReturnType
2351 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2352   if (Outs.empty())
2353     return NotStructReturn;
2354 
2355   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2356   if (!Flags.isSRet())
2357     return NotStructReturn;
2358   if (Flags.isInReg() || IsMCU)
2359     return RegStructReturn;
2360   return StackStructReturn;
2361 }
2362 
2363 /// Determines whether a function uses struct return semantics.
2364 static StructReturnType
2365 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2366   if (Ins.empty())
2367     return NotStructReturn;
2368 
2369   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2370   if (!Flags.isSRet())
2371     return NotStructReturn;
2372   if (Flags.isInReg() || IsMCU)
2373     return RegStructReturn;
2374   return StackStructReturn;
2375 }
2376 
2377 /// Make a copy of an aggregate at the address specified by "Src" to the
2378 /// address "Dst", with the size and alignment information given by the byval
2379 /// parameter attribute. The copy will be passed as a byval function parameter.
2380 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2381                                          SDValue Chain, ISD::ArgFlagsTy Flags,
2382                                          SelectionDAG &DAG, const SDLoc &dl) {
2383   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2384 
2385   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2386                        /*isVolatile*/false, /*AlwaysInline=*/true,
2387                        /*isTailCall*/false,
2388                        MachinePointerInfo(), MachinePointerInfo());
2389 }
2390 
2391 /// Return true if the calling convention is one that we can guarantee TCO for.
2392 static bool canGuaranteeTCO(CallingConv::ID CC) {
2393   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2394           CC == CallingConv::HiPE || CC == CallingConv::HHVM);
2395 }
2396 
2397 /// Return true if we might ever do TCO for calls with this calling convention.
2398 static bool mayTailCallThisCC(CallingConv::ID CC) {
2399   switch (CC) {
2400   // C calling conventions:
2401   case CallingConv::C:
2402   case CallingConv::X86_64_Win64:
2403   case CallingConv::X86_64_SysV:
2404   // Callee pop conventions:
2405   case CallingConv::X86_ThisCall:
2406   case CallingConv::X86_StdCall:
2407   case CallingConv::X86_VectorCall:
2408   case CallingConv::X86_FastCall:
2409     return true;
2410   default:
2411     return canGuaranteeTCO(CC);
2412   }
2413 }
2414 
2415 /// Return true if the function is being made into a tailcall target by
2416 /// changing its ABI.
2417 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2418   return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2419 }
2420 
2421 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2422   auto Attr =
2423       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2424   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2425     return false;
2426 
2427   CallSite CS(CI);
2428   CallingConv::ID CalleeCC = CS.getCallingConv();
2429   if (!mayTailCallThisCC(CalleeCC))
2430     return false;
2431 
2432   return true;
2433 }
2434 
2435 SDValue
2436 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2437                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2438                                     const SDLoc &dl, SelectionDAG &DAG,
2439                                     const CCValAssign &VA,
2440                                     MachineFrameInfo *MFI, unsigned i) const {
2441   // Create the nodes corresponding to a load from this parameter slot.
2442   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2443   bool AlwaysUseMutable = shouldGuaranteeTCO(
2444       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2445   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2446   EVT ValVT;
2447 
2448   // If the value is passed by pointer, the address is passed instead of the
2449   // value itself.
2450   bool ExtendedInMem = VA.isExtInLoc() &&
2451     VA.getValVT().getScalarType() == MVT::i1;
2452 
2453   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2454     ValVT = VA.getLocVT();
2455   else
2456     ValVT = VA.getValVT();
2457 
2458   // Calculate the SP offset of an interrupt parameter, re-arranging the slot
2459   // normally taken by the return address.
2460   int Offset = 0;
2461   if (CallConv == CallingConv::X86_INTR) {
2462     const X86Subtarget& Subtarget =
2463         static_cast<const X86Subtarget&>(DAG.getSubtarget());
2464     // X86 interrupts may take one or two arguments.
2465     // Unlike a regular call, there will be no return address on the stack.
2466     // The offset of the last argument needs to be set to -4/-8 bytes,
2467     // while the offset of the first of two arguments should be set to 0 bytes.
2468     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
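    // For illustration, a worked evaluation of the formula above (the values
    // shown are hypothetical, not taken from a specific build):
    //   64-bit, two arguments (Ins.size() == 2):
    //     i == 0: 8 * ((0 + 1) % 2 - 1) = 8 *  0 =  0   (first argument)
    //     i == 1: 8 * ((1 + 1) % 2 - 1) = 8 * -1 = -8   (error code)
    //   64-bit, one argument (Ins.size() == 1):
    //     i == 0: 8 * ((0 + 1) % 1 - 1) = 8 * -1 = -8
    //   The 32-bit cases are the same with a 4-byte slot size (0 and -4).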
2469   }
2470 
2471   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2472   // changed with more analysis.
2473   // In the case of tail call optimization, mark all arguments mutable, since
2474   // they could be overwritten by the lowering of arguments for a tail call.
2475   if (Flags.isByVal()) {
2476     unsigned Bytes = Flags.getByValSize();
2477     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2478     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2479     // Adjust SP offset of interrupt parameter.
2480     if (CallConv == CallingConv::X86_INTR) {
2481       MFI->setObjectOffset(FI, Offset);
2482     }
2483     return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2484   } else {
2485     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2486                                     VA.getLocMemOffset(), isImmutable);
2487 
2488     // Set SExt or ZExt flag.
2489     if (VA.getLocInfo() == CCValAssign::ZExt) {
2490       MFI->setObjectZExt(FI, true);
2491     } else if (VA.getLocInfo() == CCValAssign::SExt) {
2492       MFI->setObjectSExt(FI, true);
2493     }
2494 
2495     // Adjust SP offset of interrupt parameter.
2496     if (CallConv == CallingConv::X86_INTR) {
2497       MFI->setObjectOffset(FI, Offset);
2498     }
2499 
2500     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2501     SDValue Val = DAG.getLoad(
2502         ValVT, dl, Chain, FIN,
2503         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
2504         false, false, 0);
2505     return ExtendedInMem ?
2506       DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2507   }
2508 }
2509 
2510 // FIXME: Get this from tablegen.
2511 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2512                                                 const X86Subtarget &Subtarget) {
2513   assert(Subtarget.is64Bit());
2514 
2515   if (Subtarget.isCallingConvWin64(CallConv)) {
2516     static const MCPhysReg GPR64ArgRegsWin64[] = {
2517       X86::RCX, X86::RDX, X86::R8,  X86::R9
2518     };
2519     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2520   }
2521 
2522   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2523     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2524   };
2525   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2526 }
2527 
2528 // FIXME: Get this from tablegen.
2529 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2530                                                 CallingConv::ID CallConv,
2531                                                 const X86Subtarget &Subtarget) {
2532   assert(Subtarget.is64Bit());
2533   if (Subtarget.isCallingConvWin64(CallConv)) {
2534     // The XMM registers which might contain var arg parameters are shadowed
2535     // in their paired GPRs, so we only need to save the GPRs to their home
2536     // slots.
2537     // TODO: __vectorcall will change this.
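    // (Concretely, the Win64 pairing is XMM0<->RCX, XMM1<->RDX, XMM2<->R8 and
    // XMM3<->R9, matching the shadow-register handling in LowerCall below.)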
2538     return None;
2539   }
2540 
2541   const Function *Fn = MF.getFunction();
2542   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2543   bool isSoftFloat = Subtarget.useSoftFloat();
2544   assert(!(isSoftFloat && NoImplicitFloatOps) &&
2545          "SSE register cannot be used when SSE is disabled!");
2546   if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2547     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2548     // registers.
2549     return None;
2550 
2551   static const MCPhysReg XMMArgRegs64Bit[] = {
2552     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2553     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2554   };
2555   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2556 }
2557 
2558 SDValue X86TargetLowering::LowerFormalArguments(
2559     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2560     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2561     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2562   MachineFunction &MF = DAG.getMachineFunction();
2563   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2564   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2565 
2566   const Function *Fn = MF.getFunction();
2567   if (Fn->hasExternalLinkage() &&
2568       Subtarget.isTargetCygMing() &&
2569       Fn->getName() == "main")
2570     FuncInfo->setForceFramePointer(true);
2571 
2572   MachineFrameInfo *MFI = MF.getFrameInfo();
2573   bool Is64Bit = Subtarget.is64Bit();
2574   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2575 
2576   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2577          "Var args not supported with calling convention fastcc, ghc or hipe");
2578 
2579   if (CallConv == CallingConv::X86_INTR) {
2580     bool isLegal = Ins.size() == 1 ||
2581                    (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2582                                         (!Is64Bit && Ins[1].VT == MVT::i32)));
2583     if (!isLegal)
2584       report_fatal_error("X86 interrupts may take one or two arguments");
2585   }
2586 
2587   // Assign locations to all of the incoming arguments.
2588   SmallVector<CCValAssign, 16> ArgLocs;
2589   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2590 
2591   // Allocate shadow area for Win64
2592   if (IsWin64)
2593     CCInfo.AllocateStack(32, 8);
2594 
2595   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2596 
2597   unsigned LastVal = ~0U;
2598   SDValue ArgValue;
2599   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2600     CCValAssign &VA = ArgLocs[i];
2601     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2602     // places.
2603     assert(VA.getValNo() != LastVal &&
2604            "Don't support value assigned to multiple locs yet");
2605     (void)LastVal;
2606     LastVal = VA.getValNo();
2607 
2608     if (VA.isRegLoc()) {
2609       EVT RegVT = VA.getLocVT();
2610       const TargetRegisterClass *RC;
2611       if (RegVT == MVT::i32)
2612         RC = &X86::GR32RegClass;
2613       else if (Is64Bit && RegVT == MVT::i64)
2614         RC = &X86::GR64RegClass;
2615       else if (RegVT == MVT::f32)
2616         RC = &X86::FR32RegClass;
2617       else if (RegVT == MVT::f64)
2618         RC = &X86::FR64RegClass;
2619       else if (RegVT == MVT::f128)
2620         RC = &X86::FR128RegClass;
2621       else if (RegVT.is512BitVector())
2622         RC = &X86::VR512RegClass;
2623       else if (RegVT.is256BitVector())
2624         RC = &X86::VR256RegClass;
2625       else if (RegVT.is128BitVector())
2626         RC = &X86::VR128RegClass;
2627       else if (RegVT == MVT::x86mmx)
2628         RC = &X86::VR64RegClass;
2629       else if (RegVT == MVT::i1)
2630         RC = &X86::VK1RegClass;
2631       else if (RegVT == MVT::v8i1)
2632         RC = &X86::VK8RegClass;
2633       else if (RegVT == MVT::v16i1)
2634         RC = &X86::VK16RegClass;
2635       else if (RegVT == MVT::v32i1)
2636         RC = &X86::VK32RegClass;
2637       else if (RegVT == MVT::v64i1)
2638         RC = &X86::VK64RegClass;
2639       else
2640         llvm_unreachable("Unknown argument type!");
2641 
2642       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2643       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2644 
2645       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2646       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2647       // right size.
2648       if (VA.getLocInfo() == CCValAssign::SExt)
2649         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2650                                DAG.getValueType(VA.getValVT()));
2651       else if (VA.getLocInfo() == CCValAssign::ZExt)
2652         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2653                                DAG.getValueType(VA.getValVT()));
2654       else if (VA.getLocInfo() == CCValAssign::BCvt)
2655         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2656 
2657       if (VA.isExtInLoc()) {
2658         // Handle MMX values passed in XMM regs.
2659         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2660           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2661         else
2662           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2663       }
2664     } else {
2665       assert(VA.isMemLoc());
2666       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2667     }
2668 
2669     // If value is passed via pointer - do a load.
2670     if (VA.getLocInfo() == CCValAssign::Indirect)
2671       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2672                              MachinePointerInfo(), false, false, false, 0);
2673 
2674     InVals.push_back(ArgValue);
2675   }
2676 
2677   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2678     // The Swift calling convention does not require that we copy the sret
2679     // argument into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2680     if (CallConv == CallingConv::Swift)
2681       continue;
2682 
2683     // All x86 ABIs require that for returning structs by value we copy the
2684     // sret argument into %rax/%eax (depending on ABI) for the return. Save
2685     // the argument into a virtual register so that we can access it from the
2686     // return points.
2687     if (Ins[i].Flags.isSRet()) {
2688       unsigned Reg = FuncInfo->getSRetReturnReg();
2689       if (!Reg) {
2690         MVT PtrTy = getPointerTy(DAG.getDataLayout());
2691         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2692         FuncInfo->setSRetReturnReg(Reg);
2693       }
2694       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2695       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2696       break;
2697     }
2698   }
2699 
2700   unsigned StackSize = CCInfo.getNextStackOffset();
2701   // Align stack specially for tail calls.
2702   if (shouldGuaranteeTCO(CallConv,
2703                          MF.getTarget().Options.GuaranteedTailCallOpt))
2704     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2705 
2706   // If the function takes a variable number of arguments, make a frame index for
2707   // the start of the first vararg value... for expansion of llvm.va_start. We
2708   // can skip this if there are no va_start calls.
2709   if (MFI->hasVAStart() &&
2710       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2711                    CallConv != CallingConv::X86_ThisCall))) {
2712     FuncInfo->setVarArgsFrameIndex(
2713         MFI->CreateFixedObject(1, StackSize, true));
2714   }
2715 
2716   // Figure out if XMM registers are in use.
2717   assert(!(Subtarget.useSoftFloat() &&
2718            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2719          "SSE register cannot be used when SSE is disabled!");
2720 
2721   // 64-bit calling conventions support varargs and register parameters, so we
2722   // have to do extra work to spill them in the prologue.
2723   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2724     // Find the first unallocated argument GPR and XMM register.
2725     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2726     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2727     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2728     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2729     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2730            "SSE register cannot be used when SSE is disabled!");
2731 
2732     // Gather all the live in physical registers.
2733     SmallVector<SDValue, 6> LiveGPRs;
2734     SmallVector<SDValue, 8> LiveXMMRegs;
2735     SDValue ALVal;
2736     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2737       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2738       LiveGPRs.push_back(
2739           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2740     }
2741     if (!ArgXMMs.empty()) {
2742       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2743       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2744       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2745         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2746         LiveXMMRegs.push_back(
2747             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2748       }
2749     }
2750 
2751     if (IsWin64) {
2752       // Get to the caller-allocated home save location.  Add 8 to account
2753       // for the return address.
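      // For example, if RCX and RDX were already consumed by named arguments
      // (NumIntRegs == 2), the fixed object below lands at HomeOffset + 16,
      // i.e. on the home slots still owned by R8 and R9.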
2754       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2755       FuncInfo->setRegSaveFrameIndex(
2756           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2757       // Fixup to set vararg frame on shadow area (4 x i64).
2758       if (NumIntRegs < 4)
2759         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2760     } else {
2761       // For X86-64, if there are vararg parameters that are passed via
2762       // registers, then we must store them to their spots on the stack so
2763       // they may be loaded by dereferencing the result of va_next.
2764       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2765       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2766       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2767           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2768     }
2769 
2770     // Store the integer parameter registers.
2771     SmallVector<SDValue, 8> MemOps;
2772     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2773                                       getPointerTy(DAG.getDataLayout()));
2774     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2775     for (SDValue Val : LiveGPRs) {
2776       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2777                                 RSFIN, DAG.getIntPtrConstant(Offset, dl));
2778       SDValue Store =
2779           DAG.getStore(Val.getValue(1), dl, Val, FIN,
2780                        MachinePointerInfo::getFixedStack(
2781                            DAG.getMachineFunction(),
2782                            FuncInfo->getRegSaveFrameIndex(), Offset),
2783                        false, false, 0);
2784       MemOps.push_back(Store);
2785       Offset += 8;
2786     }
2787 
2788     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2789       // Now store the XMM (fp + vector) parameter registers.
2790       SmallVector<SDValue, 12> SaveXMMOps;
2791       SaveXMMOps.push_back(Chain);
2792       SaveXMMOps.push_back(ALVal);
2793       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2794                              FuncInfo->getRegSaveFrameIndex(), dl));
2795       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2796                              FuncInfo->getVarArgsFPOffset(), dl));
2797       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2798                         LiveXMMRegs.end());
2799       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2800                                    MVT::Other, SaveXMMOps));
2801     }
2802 
2803     if (!MemOps.empty())
2804       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2805   }
2806 
2807   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2808     // Find the largest legal vector type.
2809     MVT VecVT = MVT::Other;
2810     // FIXME: Only some x86_32 calling conventions support AVX512.
2811     if (Subtarget.hasAVX512() &&
2812         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2813                      CallConv == CallingConv::Intel_OCL_BI)))
2814       VecVT = MVT::v16f32;
2815     else if (Subtarget.hasAVX())
2816       VecVT = MVT::v8f32;
2817     else if (Subtarget.hasSSE2())
2818       VecVT = MVT::v4f32;
2819 
2820     // We forward some GPRs and some vector types.
2821     SmallVector<MVT, 2> RegParmTypes;
2822     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2823     RegParmTypes.push_back(IntVT);
2824     if (VecVT != MVT::Other)
2825       RegParmTypes.push_back(VecVT);
2826 
2827     // Compute the set of forwarded registers. The rest are scratch.
2828     SmallVectorImpl<ForwardedRegister> &Forwards =
2829         FuncInfo->getForwardedMustTailRegParms();
2830     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2831 
2832     // Conservatively forward AL on x86_64, since it might be used for varargs.
2833     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2834       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2835       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2836     }
2837 
2838     // Copy all forwards from physical to virtual registers.
2839     for (ForwardedRegister &F : Forwards) {
2840       // FIXME: Can we use a less constrained schedule?
2841       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2842       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2843       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2844     }
2845   }
2846 
2847   // Some CCs need callee pop.
2848   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2849                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2850     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2851   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
2852     // X86 interrupts must pop the error code if present
2853     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
2854   } else {
2855     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2856     // If this is an sret function, the return should pop the hidden pointer.
2857     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
2858         !Subtarget.getTargetTriple().isOSMSVCRT() &&
2859         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
2860       FuncInfo->setBytesToPopOnReturn(4);
2861   }
2862 
2863   if (!Is64Bit) {
2864     // RegSaveFrameIndex is X86-64 only.
2865     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2866     if (CallConv == CallingConv::X86_FastCall ||
2867         CallConv == CallingConv::X86_ThisCall)
2868       // fastcc functions can't have varargs.
2869       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2870   }
2871 
2872   FuncInfo->setArgumentStackSize(StackSize);
2873 
2874   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
2875     EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
2876     if (Personality == EHPersonality::CoreCLR) {
2877       assert(Is64Bit);
2878       // TODO: Add a mechanism to frame lowering that will allow us to indicate
2879       // that we'd prefer this slot be allocated towards the bottom of the frame
2880       // (i.e. near the stack pointer after allocating the frame).  Every
2881       // funclet needs a copy of this slot in its (mostly empty) frame, and the
2882       // offset from the bottom of this and each funclet's frame must be the
2883       // same, so the size of funclets' (mostly empty) frames is dictated by
2884       // how far this slot is from the bottom (since they allocate just enough
2885       // space to accommodate holding this slot at the correct offset).
2886       int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
2887       EHInfo->PSPSymFrameIdx = PSPSymFI;
2888     }
2889   }
2890 
2891   return Chain;
2892 }
2893 
2894 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
2895                                             SDValue Arg, const SDLoc &dl,
2896                                             SelectionDAG &DAG,
2897                                             const CCValAssign &VA,
2898                                             ISD::ArgFlagsTy Flags) const {
2899   unsigned LocMemOffset = VA.getLocMemOffset();
2900   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2901   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2902                        StackPtr, PtrOff);
2903   if (Flags.isByVal())
2904     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2905 
2906   return DAG.getStore(
2907       Chain, dl, Arg, PtrOff,
2908       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
2909       false, false, 0);
2910 }
2911 
2912 /// Emit a load of the return address if tail call
2913 /// optimization is performed and it is required.
2914 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
2915     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
2916     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
2917   // Adjust the Return address stack slot.
2918   EVT VT = getPointerTy(DAG.getDataLayout());
2919   OutRetAddr = getReturnAddressFrameIndex(DAG);
2920 
2921   // Load the "old" Return address.
2922   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2923                            false, false, false, 0);
2924   return SDValue(OutRetAddr.getNode(), 1);
2925 }
2926 
2927 /// Emit a store of the return address if tail call
2928 /// optimization is performed and it is required (FPDiff!=0).
2929 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2930                                         SDValue Chain, SDValue RetAddrFrIdx,
2931                                         EVT PtrVT, unsigned SlotSize,
2932                                         int FPDiff, const SDLoc &dl) {
2933   // Store the return address to the appropriate stack slot.
2934   if (!FPDiff) return Chain;
2935   // Calculate the new stack slot for the return address.
2936   int NewReturnAddrFI =
2937     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2938                                          false);
2939   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2940   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2941                        MachinePointerInfo::getFixedStack(
2942                            DAG.getMachineFunction(), NewReturnAddrFI),
2943                        false, false, 0);
2944   return Chain;
2945 }
2946 
2947 /// Returns a vector_shuffle mask for a movs{s|d} or movd
2948 /// operation of the specified width.
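/// For example, for a 4-element type the returned mask is <4, 1, 2, 3>: lane 0
/// is taken from V2 (indices >= NumElems select from V2) and the remaining
/// lanes are taken from V1.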
2949 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
2950                        SDValue V2) {
2951   unsigned NumElems = VT.getVectorNumElements();
2952   SmallVector<int, 8> Mask;
2953   Mask.push_back(NumElems);
2954   for (unsigned i = 1; i != NumElems; ++i)
2955     Mask.push_back(i);
2956   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
2957 }
2958 
2959 SDValue
2960 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2961                              SmallVectorImpl<SDValue> &InVals) const {
2962   SelectionDAG &DAG                     = CLI.DAG;
2963   SDLoc &dl                             = CLI.DL;
2964   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2965   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2966   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2967   SDValue Chain                         = CLI.Chain;
2968   SDValue Callee                        = CLI.Callee;
2969   CallingConv::ID CallConv              = CLI.CallConv;
2970   bool &isTailCall                      = CLI.IsTailCall;
2971   bool isVarArg                         = CLI.IsVarArg;
2972 
2973   MachineFunction &MF = DAG.getMachineFunction();
2974   bool Is64Bit        = Subtarget.is64Bit();
2975   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
2976   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
2977   bool IsSibcall      = false;
2978   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2979   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
2980 
2981   if (CallConv == CallingConv::X86_INTR)
2982     report_fatal_error("X86 interrupts may not be called directly");
2983 
2984   if (Attr.getValueAsString() == "true")
2985     isTailCall = false;
2986 
2987   if (Subtarget.isPICStyleGOT() &&
2988       !MF.getTarget().Options.GuaranteedTailCallOpt) {
2989     // If we are using a GOT, disable tail calls to external symbols with
2990     // default visibility. Tail calling such a symbol requires using a GOT
2991     // relocation, which forces early binding of the symbol. This breaks code
2992     // that requires lazy function symbol resolution. Using musttail or
2993     // GuaranteedTailCallOpt will override this.
2994     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2995     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2996                G->getGlobal()->hasDefaultVisibility()))
2997       isTailCall = false;
2998   }
2999 
3000   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3001   if (IsMustTail) {
3002     // Force this to be a tail call.  The verifier rules are enough to ensure
3003     // that we can lower this successfully without moving the return address
3004     // around.
3005     isTailCall = true;
3006   } else if (isTailCall) {
3007     // Check if it's really possible to do a tail call.
3008     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3009                     isVarArg, SR != NotStructReturn,
3010                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3011                     Outs, OutVals, Ins, DAG);
3012 
3013     // Sibcalls are automatically detected tailcalls which do not require
3014     // ABI changes.
3015     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3016       IsSibcall = true;
3017 
3018     if (isTailCall)
3019       ++NumTailCalls;
3020   }
3021 
3022   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3023          "Var args not supported with calling convention fastcc, ghc or hipe");
3024 
3025   // Analyze operands of the call, assigning locations to each operand.
3026   SmallVector<CCValAssign, 16> ArgLocs;
3027   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3028 
3029   // Allocate shadow area for Win64
3030   if (IsWin64)
3031     CCInfo.AllocateStack(32, 8);
3032 
3033   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3034 
3035   // Get a count of how many bytes are to be pushed on the stack.
3036   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3037   if (IsSibcall)
3038     // This is a sibcall. The memory operands are already available in the
3039     // caller's incoming argument area (its own caller's stack).
3040     NumBytes = 0;
3041   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3042            canGuaranteeTCO(CallConv))
3043     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3044 
3045   int FPDiff = 0;
3046   if (isTailCall && !IsSibcall && !IsMustTail) {
3047     // Lower arguments at fp - stackoffset + fpdiff.
3048     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3049 
3050     FPDiff = NumBytesCallerPushed - NumBytes;
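    // For example (hypothetical sizes): if the caller's own stack arguments
    // occupy 8 bytes but this call needs 24 bytes of argument space, FPDiff is
    // 8 - 24 = -16, so the return address slot must move 16 bytes further down.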
3051 
3052     // Set the delta of the movement of the return address stack slot,
3053     // but only if the delta is greater than the previous delta.
3054     if (FPDiff < X86Info->getTCReturnAddrDelta())
3055       X86Info->setTCReturnAddrDelta(FPDiff);
3056   }
3057 
3058   unsigned NumBytesToPush = NumBytes;
3059   unsigned NumBytesToPop = NumBytes;
3060 
3061   // If we have an inalloca argument, all stack space has already been allocated
3062   // for us and will be right at the top of the stack.  We don't support multiple
3063   // arguments passed in memory when using inalloca.
3064   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3065     NumBytesToPush = 0;
3066     if (!ArgLocs.back().isMemLoc())
3067       report_fatal_error("cannot use inalloca attribute on a register "
3068                          "parameter");
3069     if (ArgLocs.back().getLocMemOffset() != 0)
3070       report_fatal_error("any parameter with the inalloca attribute must be "
3071                          "the only memory argument");
3072   }
3073 
3074   if (!IsSibcall)
3075     Chain = DAG.getCALLSEQ_START(
3076         Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3077 
3078   SDValue RetAddrFrIdx;
3079   // Load return address for tail calls.
3080   if (isTailCall && FPDiff)
3081     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3082                                     Is64Bit, FPDiff, dl);
3083 
3084   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3085   SmallVector<SDValue, 8> MemOpChains;
3086   SDValue StackPtr;
3087 
3088   // Walk the register/memloc assignments, inserting copies/loads.  In the case
3089   // of tail call optimization, arguments are handled later.
3090   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3091   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3092     // Skip inalloca arguments, they have already been written.
3093     ISD::ArgFlagsTy Flags = Outs[i].Flags;
3094     if (Flags.isInAlloca())
3095       continue;
3096 
3097     CCValAssign &VA = ArgLocs[i];
3098     EVT RegVT = VA.getLocVT();
3099     SDValue Arg = OutVals[i];
3100     bool isByVal = Flags.isByVal();
3101 
3102     // Promote the value if needed.
3103     switch (VA.getLocInfo()) {
3104     default: llvm_unreachable("Unknown loc info!");
3105     case CCValAssign::Full: break;
3106     case CCValAssign::SExt:
3107       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3108       break;
3109     case CCValAssign::ZExt:
3110       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3111       break;
3112     case CCValAssign::AExt:
3113       if (Arg.getValueType().isVector() &&
3114           Arg.getValueType().getVectorElementType() == MVT::i1)
3115         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3116       else if (RegVT.is128BitVector()) {
3117         // Special case: passing MMX values in XMM registers.
3118         Arg = DAG.getBitcast(MVT::i64, Arg);
3119         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3120         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3121       } else
3122         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3123       break;
3124     case CCValAssign::BCvt:
3125       Arg = DAG.getBitcast(RegVT, Arg);
3126       break;
3127     case CCValAssign::Indirect: {
3128       // Store the argument.
3129       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3130       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3131       Chain = DAG.getStore(
3132           Chain, dl, Arg, SpillSlot,
3133           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3134           false, false, 0);
3135       Arg = SpillSlot;
3136       break;
3137     }
3138     }
3139 
3140     if (VA.isRegLoc()) {
3141       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3142       if (isVarArg && IsWin64) {
3143         // The Win64 ABI requires an argument passed in an XMM register to be
3144         // copied to its corresponding shadow GPR if the callee is a varargs function.
3145         unsigned ShadowReg = 0;
3146         switch (VA.getLocReg()) {
3147         case X86::XMM0: ShadowReg = X86::RCX; break;
3148         case X86::XMM1: ShadowReg = X86::RDX; break;
3149         case X86::XMM2: ShadowReg = X86::R8; break;
3150         case X86::XMM3: ShadowReg = X86::R9; break;
3151         }
3152         if (ShadowReg)
3153           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3154       }
3155     } else if (!IsSibcall && (!isTailCall || isByVal)) {
3156       assert(VA.isMemLoc());
3157       if (!StackPtr.getNode())
3158         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3159                                       getPointerTy(DAG.getDataLayout()));
3160       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3161                                              dl, DAG, VA, Flags));
3162     }
3163   }
3164 
3165   if (!MemOpChains.empty())
3166     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3167 
3168   if (Subtarget.isPICStyleGOT()) {
3169     // ELF / PIC requires the GOT pointer to be in the EBX register before
3170     // function calls made via the PLT.
3171     if (!isTailCall) {
3172       RegsToPass.push_back(std::make_pair(
3173           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3174                                           getPointerTy(DAG.getDataLayout()))));
3175     } else {
3176       // If we are tail calling and generating PIC/GOT-style code, load the
3177       // address of the callee into ECX. The value in ECX is used as the target
3178       // of the tail jump. This is done to circumvent the ebx/callee-saved
3179       // problem for tail calls on PIC/GOT architectures. Normally we would just
3180       // put the address of the GOT into ebx and then call target@PLT. But for
3181       // tail calls, ebx would be restored (since ebx is callee saved) before
3182       // jumping to target@PLT.
3183 
3184       // Note: The actual moving to ECX is done further down.
3185       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3186       if (G && !G->getGlobal()->hasLocalLinkage() &&
3187           G->getGlobal()->hasDefaultVisibility())
3188         Callee = LowerGlobalAddress(Callee, DAG);
3189       else if (isa<ExternalSymbolSDNode>(Callee))
3190         Callee = LowerExternalSymbol(Callee, DAG);
3191     }
3192   }
3193 
3194   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3195     // From AMD64 ABI document:
3196     // For calls that may call functions that use varargs or stdargs
3197     // (prototype-less calls or calls to functions containing ellipsis (...) in
3198     // the declaration), %al is used as a hidden argument to specify the number
3199     // of SSE registers used. The contents of %al do not need to match exactly
3200     // the number of registers, but must be an upper bound on the number of SSE
3201     // registers used and must be in the range 0 - 8 inclusive.
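    // For example, if three XMM argument registers were allocated for this
    // call, %al is set to 3 below.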
3202 
3203     // Count the number of XMM registers allocated.
3204     static const MCPhysReg XMMArgRegs[] = {
3205       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3206       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3207     };
3208     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3209     assert((Subtarget.hasSSE1() || !NumXMMRegs)
3210            && "SSE registers cannot be used when SSE is disabled");
3211 
3212     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3213                                         DAG.getConstant(NumXMMRegs, dl,
3214                                                         MVT::i8)));
3215   }
3216 
3217   if (isVarArg && IsMustTail) {
3218     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3219     for (const auto &F : Forwards) {
3220       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3221       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3222     }
3223   }
3224 
3225   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3226   // don't need this because the eligibility check rejects calls that require
3227   // shuffling arguments passed in memory.
3228   if (!IsSibcall && isTailCall) {
3229     // Force all the incoming stack arguments to be loaded from the stack
3230     // before any new outgoing arguments are stored to the stack, because the
3231     // outgoing stack slots may alias the incoming argument stack slots, and
3232     // the alias isn't otherwise explicit. This is slightly more conservative
3233     // than necessary, because it means that each store effectively depends
3234     // on every argument instead of just those arguments it would clobber.
3235     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3236 
3237     SmallVector<SDValue, 8> MemOpChains2;
3238     SDValue FIN;
3239     int FI = 0;
3240     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3241       CCValAssign &VA = ArgLocs[i];
3242       if (VA.isRegLoc())
3243         continue;
3244       assert(VA.isMemLoc());
3245       SDValue Arg = OutVals[i];
3246       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3247       // Skip inalloca arguments.  They don't require any work.
3248       if (Flags.isInAlloca())
3249         continue;
3250       // Create frame index.
3251       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3252       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3253       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3254       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3255 
3256       if (Flags.isByVal()) {
3257         // Copy relative to framepointer.
3258         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3259         if (!StackPtr.getNode())
3260           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3261                                         getPointerTy(DAG.getDataLayout()));
3262         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3263                              StackPtr, Source);
3264 
3265         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3266                                                          ArgChain,
3267                                                          Flags, DAG, dl));
3268       } else {
3269         // Store relative to framepointer.
3270         MemOpChains2.push_back(DAG.getStore(
3271             ArgChain, dl, Arg, FIN,
3272             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3273             false, false, 0));
3274       }
3275     }
3276 
3277     if (!MemOpChains2.empty())
3278       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3279 
3280     // Store the return address to the appropriate stack slot.
3281     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3282                                      getPointerTy(DAG.getDataLayout()),
3283                                      RegInfo->getSlotSize(), FPDiff, dl);
3284   }
3285 
3286   // Build a sequence of copy-to-reg nodes chained together with token chain
3287   // and flag operands which copy the outgoing args into registers.
3288   SDValue InFlag;
3289   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3290     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3291                              RegsToPass[i].second, InFlag);
3292     InFlag = Chain.getValue(1);
3293   }
3294 
3295   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3296     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3297     // In the 64-bit large code model, we have to make all calls
3298     // through a register, since the call instruction's 32-bit
3299     // pc-relative offset may not be large enough to hold the whole
3300     // address.
3301   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3302     // If the callee is a GlobalAddress node (quite common, every direct call
3303     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3304     // it.
3305     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3306 
3307     // We should use an extra load for direct calls to dllimported functions
3308     // in non-JIT mode.
3309     const GlobalValue *GV = G->getGlobal();
3310     if (!GV->hasDLLImportStorageClass()) {
3311       unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3312 
3313       Callee = DAG.getTargetGlobalAddress(
3314           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3315 
3316       if (OpFlags == X86II::MO_GOTPCREL) {
3317         // Add a wrapper.
3318         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3319           getPointerTy(DAG.getDataLayout()), Callee);
3320         // Add extra indirection
3321         Callee = DAG.getLoad(
3322           getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3323           MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
3324           false, 0);
3325       }
3326     }
3327   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3328     const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3329     unsigned char OpFlags =
3330         Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3331 
3332     Callee = DAG.getTargetExternalSymbol(
3333         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3334   } else if (Subtarget.isTarget64BitILP32() &&
3335              Callee->getValueType(0) == MVT::i32) {
3336     // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI.
3337     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3338   }
3339 
3340   // Returns a chain & a flag for retval copy to use.
3341   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3342   SmallVector<SDValue, 8> Ops;
3343 
3344   if (!IsSibcall && isTailCall) {
3345     Chain = DAG.getCALLSEQ_END(Chain,
3346                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3347                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3348     InFlag = Chain.getValue(1);
3349   }
3350 
3351   Ops.push_back(Chain);
3352   Ops.push_back(Callee);
3353 
3354   if (isTailCall)
3355     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3356 
3357   // Add argument registers to the end of the list so that they are known live
3358   // into the call.
3359   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3360     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3361                                   RegsToPass[i].second.getValueType()));
3362 
3363   // Add a register mask operand representing the call-preserved registers.
3364   const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3365   assert(Mask && "Missing call preserved mask for calling convention");
3366 
3367   // If this is an invoke in a 32-bit function using a funclet-based
3368   // personality, assume the function clobbers all registers. If an exception
3369   // is thrown, the runtime will not restore CSRs.
3370   // FIXME: Model this more precisely so that we can register allocate across
3371   // the normal edge and spill and fill across the exceptional edge.
3372   if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3373     const Function *CallerFn = MF.getFunction();
3374     EHPersonality Pers =
3375         CallerFn->hasPersonalityFn()
3376             ? classifyEHPersonality(CallerFn->getPersonalityFn())
3377             : EHPersonality::Unknown;
3378     if (isFuncletEHPersonality(Pers))
3379       Mask = RegInfo->getNoPreservedMask();
3380   }
3381 
3382   Ops.push_back(DAG.getRegisterMask(Mask));
3383 
3384   if (InFlag.getNode())
3385     Ops.push_back(InFlag);
3386 
3387   if (isTailCall) {
3388     // We used to do:
3389     //// If this is the first return lowered for this function, add the regs
3390     //// to the liveout set for the function.
3391     // This isn't right, although it's probably harmless on x86; liveouts
3392     // should be computed from returns not tail calls.  Consider a void
3393     // function making a tail call to a function returning int.
3394     MF.getFrameInfo()->setHasTailCall();
3395     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3396   }
3397 
3398   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3399   InFlag = Chain.getValue(1);
3400 
3401   // Create the CALLSEQ_END node.
3402   unsigned NumBytesForCalleeToPop;
3403   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3404                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3405     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3406   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3407            !Subtarget.getTargetTriple().isOSMSVCRT() &&
3408            SR == StackStructReturn)
3409     // If this is a call to a struct-return function, the callee
3410     // pops the hidden struct pointer, so we have to push it back.
3411     // This is common for Darwin/X86, Linux & Mingw32 targets.
3412     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3413     NumBytesForCalleeToPop = 4;
3414   else
3415     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3416 
3417   if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3418     // No need to reset the stack after the call if the call doesn't return. To
3419     // make the MI verifier happy, we'll pretend the callee does it for us.
3420     NumBytesForCalleeToPop = NumBytes;
3421   }
3422 
3423   // Returns a flag for retval copy to use.
3424   if (!IsSibcall) {
3425     Chain = DAG.getCALLSEQ_END(Chain,
3426                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3427                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3428                                                      true),
3429                                InFlag, dl);
3430     InFlag = Chain.getValue(1);
3431   }
3432 
3433   // Handle result values, copying them out of physregs into vregs that we
3434   // return.
3435   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3436                          Ins, dl, DAG, InVals);
3437 }
3438 
3439 //===----------------------------------------------------------------------===//
3440 //                Fast Calling Convention (tail call) implementation
3441 //===----------------------------------------------------------------------===//
3442 
3443 //  Like the stdcall convention, the callee cleans up the arguments, except
3444 //  that ECX is reserved for storing the address of the tail-called function.
3445 //  Only 2 registers are free for argument passing (inreg). Tail call
3446 //  optimization is performed provided:
3447 //                * tailcallopt is enabled
3448 //                * caller/callee are fastcc
3449 //  On the X86_64 architecture with GOT-style position-independent code, only
3450 //  local (within-module) calls are supported at the moment.
3451 //  To keep the stack aligned according to the platform ABI, the function
3452 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
3453 //  multiple of the stack alignment. (Dynamic linkers need this, e.g. darwin's
3454 //  dyld.) If the tail-called callee has more arguments than the caller, the
3455 //  caller needs to make sure that there is room to move the RETADDR to. This
3456 //  is achieved by reserving an area the size of the argument delta right after
3457 //  the original RETADDR, but before the saved frame pointer or the spilled
3458 //  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
3459 //  stack layout:
3460 //    arg1
3461 //    arg2
3462 //    RETADDR
3463 //    [ new RETADDR
3464 //      move area ]
3465 //    (possible EBP)
3466 //    ESI
3467 //    EDI
3468 //    local1 ..
3469 
3470 /// Align the stack size so that it is, e.g., 16n + 12 bytes for a 16-byte
3471 /// alignment requirement (i.e. StackAlignment - SlotSize modulo the alignment).
3472 unsigned
3473 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3474                                                SelectionDAG& DAG) const {
3475   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3476   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3477   unsigned StackAlignment = TFI.getStackAlignment();
3478   uint64_t AlignMask = StackAlignment - 1;
3479   int64_t Offset = StackSize;
3480   unsigned SlotSize = RegInfo->getSlotSize();
3481   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3482     // The low bits are no larger than (StackAlignment - SlotSize), so just add the difference.
3483     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3484   } else {
3485     // Mask out the lower bits, then add one full stack alignment plus (StackAlignment - SlotSize).
3486     Offset = ((~AlignMask) & Offset) + StackAlignment +
3487       (StackAlignment-SlotSize);
3488   }
3489   return Offset;
3490 }
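
// Illustrative worked example (assuming a 16-byte stack alignment and a
// 4-byte slot size, i.e. a 32-bit target): StackSize = 20 has low bits
// 20 & 15 = 4 <= 12, so the result is 20 + (12 - 4) = 28 = 16*1 + 12;
// StackSize = 30 has low bits 14 > 12, so the result is
// (30 & ~15) + 16 + 12 = 44 = 16*2 + 12. In both cases the returned size is
// congruent to StackAlignment - SlotSize modulo the alignment, so the stack
// stays 16-byte aligned once the return-address slot is accounted for.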
3491 
3492 /// Return true if the given stack call argument is already available in the
3493 /// same position (relatively) of the caller's incoming argument stack.
3494 static
3495 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3496                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3497                          const X86InstrInfo *TII, const CCValAssign &VA) {
3498   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3499 
3500   for (;;) {
3501     // Look through nodes that don't alter the bits of the incoming value.
3502     unsigned Op = Arg.getOpcode();
3503     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3504       Arg = Arg.getOperand(0);
3505       continue;
3506     }
3507     if (Op == ISD::TRUNCATE) {
3508       const SDValue &TruncInput = Arg.getOperand(0);
3509       if (TruncInput.getOpcode() == ISD::AssertZext &&
3510           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3511               Arg.getValueType()) {
3512         Arg = TruncInput.getOperand(0);
3513         continue;
3514       }
3515     }
3516     break;
3517   }
3518 
3519   int FI = INT_MAX;
3520   if (Arg.getOpcode() == ISD::CopyFromReg) {
3521     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3522     if (!TargetRegisterInfo::isVirtualRegister(VR))
3523       return false;
3524     MachineInstr *Def = MRI->getVRegDef(VR);
3525     if (!Def)
3526       return false;
3527     if (!Flags.isByVal()) {
3528       if (!TII->isLoadFromStackSlot(*Def, FI))
3529         return false;
3530     } else {
3531       unsigned Opcode = Def->getOpcode();
3532       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3533            Opcode == X86::LEA64_32r) &&
3534           Def->getOperand(1).isFI()) {
3535         FI = Def->getOperand(1).getIndex();
3536         Bytes = Flags.getByValSize();
3537       } else
3538         return false;
3539     }
3540   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3541     if (Flags.isByVal())
3542       // ByVal argument is passed in as a pointer but it's now being
3543       // dereferenced. e.g.
3544       // define @foo(%struct.X* %A) {
3545       //   tail call @bar(%struct.X* byval %A)
3546       // }
3547       return false;
3548     SDValue Ptr = Ld->getBasePtr();
3549     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3550     if (!FINode)
3551       return false;
3552     FI = FINode->getIndex();
3553   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3554     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3555     FI = FINode->getIndex();
3556     Bytes = Flags.getByValSize();
3557   } else
3558     return false;
3559 
3560   assert(FI != INT_MAX);
3561   if (!MFI->isFixedObjectIndex(FI))
3562     return false;
3563 
3564   if (Offset != MFI->getObjectOffset(FI))
3565     return false;
3566 
3567   if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
3568     // If the argument location is wider than the argument type, check that any
3569     // extension flags match.
3570     if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
3571         Flags.isSExt() != MFI->isObjectSExt(FI)) {
3572       return false;
3573     }
3574   }
3575 
3576   return Bytes == MFI->getObjectSize(FI);
3577 }
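
// Illustrative example of the case this helper is meant to catch: if the
// caller received an i32 argument from a fixed stack slot at offset 8 and
// forwards that same value at outgoing offset 8 with identical size and
// extension flags, the checks above succeed and the tail call can simply
// reuse the slot without copying anything.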
3578 
3579 /// Check whether the call is eligible for tail call optimization. Targets
3580 /// that want to do tail call optimization should implement this function.
3581 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3582     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3583     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3584     const SmallVectorImpl<ISD::OutputArg> &Outs,
3585     const SmallVectorImpl<SDValue> &OutVals,
3586     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3587   if (!mayTailCallThisCC(CalleeCC))
3588     return false;
3589 
3590   // If -tailcallopt is specified, make fastcc functions tail-callable.
3591   MachineFunction &MF = DAG.getMachineFunction();
3592   const Function *CallerF = MF.getFunction();
3593 
3594   // If the function return type is x86_fp80 and the callee return type is not,
3595   // then the FP_EXTEND of the call result is not a nop. It's not safe to
3596   // perform a tailcall optimization here.
3597   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3598     return false;
3599 
3600   CallingConv::ID CallerCC = CallerF->getCallingConv();
3601   bool CCMatch = CallerCC == CalleeCC;
3602   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3603   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3604 
3605   // Win64 functions have extra shadow space for argument homing. Don't do the
3606   // sibcall if the caller and callee have mismatched expectations for this
3607   // space.
3608   if (IsCalleeWin64 != IsCallerWin64)
3609     return false;
3610 
3611   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3612     if (canGuaranteeTCO(CalleeCC) && CCMatch)
3613       return true;
3614     return false;
3615   }
3616 
3617   // Look for obvious safe cases to perform tail call optimization that do not
3618   // require ABI changes. This is what gcc calls sibcall.
3619 
3620   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3621   // emit a special epilogue.
3622   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3623   if (RegInfo->needsStackRealignment(MF))
3624     return false;
3625 
3626   // Also avoid sibcall optimization if either caller or callee uses struct
3627   // return semantics.
3628   if (isCalleeStructRet || isCallerStructRet)
3629     return false;
3630 
3631   // Do not sibcall optimize vararg calls unless all arguments are passed via
3632   // registers.
3633   LLVMContext &C = *DAG.getContext();
3634   if (isVarArg && !Outs.empty()) {
3635     // Optimizing for varargs on Win64 is unlikely to be safe without
3636     // additional testing.
3637     if (IsCalleeWin64 || IsCallerWin64)
3638       return false;
3639 
3640     SmallVector<CCValAssign, 16> ArgLocs;
3641     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3642 
3643     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3644     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3645       if (!ArgLocs[i].isRegLoc())
3646         return false;
3647   }
3648 
3649   // If the call result is in ST0 / ST1, it needs to be popped off the x87
3650   // stack.  Therefore, if it's not used by the call it is not safe to optimize
3651   // this into a sibcall.
3652   bool Unused = false;
3653   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3654     if (!Ins[i].Used) {
3655       Unused = true;
3656       break;
3657     }
3658   }
3659   if (Unused) {
3660     SmallVector<CCValAssign, 16> RVLocs;
3661     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3662     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3663     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3664       CCValAssign &VA = RVLocs[i];
3665       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3666         return false;
3667     }
3668   }
3669 
3670   // Check that the call results are passed in the same way.
3671   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3672                                   RetCC_X86, RetCC_X86))
3673     return false;
3674   // The callee has to preserve all registers the caller needs to preserve.
3675   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3676   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3677   if (!CCMatch) {
3678     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3679     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3680       return false;
3681   }
3682 
3683   unsigned StackArgsSize = 0;
3684 
3685   // If the callee takes no arguments then go on to check the results of the
3686   // call.
3687   if (!Outs.empty()) {
3688     // Check if stack adjustment is needed. For now, do not do this if any
3689     // argument is passed on the stack.
3690     SmallVector<CCValAssign, 16> ArgLocs;
3691     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3692 
3693     // Allocate shadow area for Win64
3694     if (IsCalleeWin64)
3695       CCInfo.AllocateStack(32, 8);
3696 
3697     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3698     StackArgsSize = CCInfo.getNextStackOffset();
3699 
3700     if (CCInfo.getNextStackOffset()) {
3701       // Check if the arguments are already laid out in the right way as
3702       // the caller's fixed stack objects.
3703       MachineFrameInfo *MFI = MF.getFrameInfo();
3704       const MachineRegisterInfo *MRI = &MF.getRegInfo();
3705       const X86InstrInfo *TII = Subtarget.getInstrInfo();
3706       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3707         CCValAssign &VA = ArgLocs[i];
3708         SDValue Arg = OutVals[i];
3709         ISD::ArgFlagsTy Flags = Outs[i].Flags;
3710         if (VA.getLocInfo() == CCValAssign::Indirect)
3711           return false;
3712         if (!VA.isRegLoc()) {
3713           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3714                                    MFI, MRI, TII, VA))
3715             return false;
3716         }
3717       }
3718     }
3719 
3720     bool PositionIndependent = isPositionIndependent();
3721     // If the tailcall address may be in a register, then make sure it's
3722     // possible to register allocate for it. In 32-bit, the call address can
3723     // only target EAX, EDX, or ECX since the tail call must be scheduled after
3724     // callee-saved registers are restored. These happen to be the same
3725     // registers used to pass 'inreg' arguments so watch out for those.
3726     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
3727                                   !isa<ExternalSymbolSDNode>(Callee)) ||
3728                                  PositionIndependent)) {
3729       unsigned NumInRegs = 0;
3730       // In PIC we need an extra register to formulate the address computation
3731       // for the callee.
3732       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
3733 
3734       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3735         CCValAssign &VA = ArgLocs[i];
3736         if (!VA.isRegLoc())
3737           continue;
3738         unsigned Reg = VA.getLocReg();
3739         switch (Reg) {
3740         default: break;
3741         case X86::EAX: case X86::EDX: case X86::ECX:
3742           if (++NumInRegs == MaxInRegs)
3743             return false;
3744           break;
3745         }
3746       }
3747     }
3748 
3749     const MachineRegisterInfo &MRI = MF.getRegInfo();
3750     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3751       return false;
3752   }
3753 
3754   bool CalleeWillPop =
3755       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
3756                        MF.getTarget().Options.GuaranteedTailCallOpt);
3757 
3758   if (unsigned BytesToPop =
3759           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
3760     // If we have bytes to pop, the callee must pop them.
3761     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
3762     if (!CalleePopMatches)
3763       return false;
3764   } else if (CalleeWillPop && StackArgsSize > 0) {
3765     // If we don't have bytes to pop, make sure the callee doesn't pop any.
3766     return false;
3767   }
3768 
3769   return true;
3770 }
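
// Illustrative sketch of a call that satisfies the checks above (assuming
// matching calling conventions, no struct return, and no stack realignment):
//
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @callee(i32 %x)
//     ret i32 %r
//   }
//
// Here the argument is either already in a register or sits in the caller's
// matching fixed stack slot, so the call can be emitted as a jmp (sibcall).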
3771 
3772 FastISel *
3773 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3774                                   const TargetLibraryInfo *libInfo) const {
3775   return X86::createFastISel(funcInfo, libInfo);
3776 }
3777 
3778 //===----------------------------------------------------------------------===//
3779 //                           Other Lowering Hooks
3780 //===----------------------------------------------------------------------===//
3781 
3782 static bool MayFoldLoad(SDValue Op) {
3783   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3784 }
3785 
3786 static bool MayFoldIntoStore(SDValue Op) {
3787   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3788 }
3789 
3790 static bool isTargetShuffle(unsigned Opcode) {
3791   switch(Opcode) {
3792   default: return false;
3793   case X86ISD::BLENDI:
3794   case X86ISD::PSHUFB:
3795   case X86ISD::PSHUFD:
3796   case X86ISD::PSHUFHW:
3797   case X86ISD::PSHUFLW:
3798   case X86ISD::SHUFP:
3799   case X86ISD::INSERTPS:
3800   case X86ISD::PALIGNR:
3801   case X86ISD::VSHLDQ:
3802   case X86ISD::VSRLDQ:
3803   case X86ISD::MOVLHPS:
3804   case X86ISD::MOVLHPD:
3805   case X86ISD::MOVHLPS:
3806   case X86ISD::MOVLPS:
3807   case X86ISD::MOVLPD:
3808   case X86ISD::MOVSHDUP:
3809   case X86ISD::MOVSLDUP:
3810   case X86ISD::MOVDDUP:
3811   case X86ISD::MOVSS:
3812   case X86ISD::MOVSD:
3813   case X86ISD::UNPCKL:
3814   case X86ISD::UNPCKH:
3815   case X86ISD::VPERMILPI:
3816   case X86ISD::VPERMILPV:
3817   case X86ISD::VPERM2X128:
3818   case X86ISD::VPERMIL2:
3819   case X86ISD::VPERMI:
3820   case X86ISD::VPPERM:
3821   case X86ISD::VPERMV:
3822   case X86ISD::VPERMV3:
3823   case X86ISD::VZEXT_MOVL:
3824     return true;
3825   }
3826 }
3827 
3828 static bool isTargetShuffleVariableMask(unsigned Opcode) {
3829   switch (Opcode) {
3830   default: return false;
3831   case X86ISD::PSHUFB:
3832   case X86ISD::VPERMILPV:
3833     return true;
3834   }
3835 }
3836 
3837 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3838                                     SDValue V1, unsigned TargetMask,
3839                                     SelectionDAG &DAG) {
3840   switch(Opc) {
3841   default: llvm_unreachable("Unknown x86 shuffle node");
3842   case X86ISD::PSHUFD:
3843   case X86ISD::PSHUFHW:
3844   case X86ISD::PSHUFLW:
3845   case X86ISD::VPERMILPI:
3846   case X86ISD::VPERMI:
3847     return DAG.getNode(Opc, dl, VT, V1,
3848                        DAG.getConstant(TargetMask, dl, MVT::i8));
3849   }
3850 }
3851 
3852 static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3853                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3854   switch(Opc) {
3855   default: llvm_unreachable("Unknown x86 shuffle node");
3856   case X86ISD::MOVLHPS:
3857   case X86ISD::MOVLHPD:
3858   case X86ISD::MOVHLPS:
3859   case X86ISD::MOVLPS:
3860   case X86ISD::MOVLPD:
3861   case X86ISD::MOVSS:
3862   case X86ISD::MOVSD:
3863   case X86ISD::UNPCKL:
3864   case X86ISD::UNPCKH:
3865     return DAG.getNode(Opc, dl, VT, V1, V2);
3866   }
3867 }
3868 
3869 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3870   MachineFunction &MF = DAG.getMachineFunction();
3871   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3872   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3873   int ReturnAddrIndex = FuncInfo->getRAIndex();
3874 
3875   if (ReturnAddrIndex == 0) {
3876     // Set up a frame object for the return address.
3877     unsigned SlotSize = RegInfo->getSlotSize();
3878     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3879                                                            -(int64_t)SlotSize,
3880                                                            false);
3881     FuncInfo->setRAIndex(ReturnAddrIndex);
3882   }
3883 
3884   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3885 }
3886 
3887 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3888                                        bool hasSymbolicDisplacement) {
3889   // Offset should fit into 32 bit immediate field.
3890   if (!isInt<32>(Offset))
3891     return false;
3892 
3893   // If we don't have a symbolic displacement - we don't have any extra
3894   // restrictions.
3895   if (!hasSymbolicDisplacement)
3896     return true;
3897 
3898   // FIXME: Some tweaks might be needed for medium code model.
3899   if (M != CodeModel::Small && M != CodeModel::Kernel)
3900     return false;
3901 
3902   // For the small code model we assume that the last object lies at least
3903   // 16MB below the 31-bit boundary. We may also accept fairly large negative
3904   // constants, since all objects live in the positive half of the address space.
3905   if (M == CodeModel::Small && Offset < 16*1024*1024)
3906     return true;
3907 
3908   // For the kernel code model we know that all objects reside in the negative
3909   // half of the 32-bit address space. We must not accept negative offsets, since
3910   // they may fall out of range, but we may accept fairly large positive ones.
3911   if (M == CodeModel::Kernel && Offset >= 0)
3912     return true;
3913 
3914   return false;
3915 }
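
// Illustrative examples: with a symbolic displacement, an offset of 1MB is
// acceptable under the small code model (positive and well below 16MB) and
// under the kernel code model (non-negative); an offset of -4096 is still
// acceptable under the small code model but is rejected under the kernel
// code model, where negative offsets are not allowed.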
3916 
3917 /// Determines whether the callee is required to pop its own arguments.
3918 /// Callee pop is necessary to support tail calls.
3919 bool X86::isCalleePop(CallingConv::ID CallingConv,
3920                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
3921   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
3922   // can guarantee TCO.
3923   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
3924     return true;
3925 
3926   switch (CallingConv) {
3927   default:
3928     return false;
3929   case CallingConv::X86_StdCall:
3930   case CallingConv::X86_FastCall:
3931   case CallingConv::X86_ThisCall:
3932   case CallingConv::X86_VectorCall:
3933     return !is64Bit;
3934   }
3935 }
3936 
3937 /// \brief Return true if the condition is an unsigned comparison operation.
3938 static bool isX86CCUnsigned(unsigned X86CC) {
3939   switch (X86CC) {
3940   default:
3941     llvm_unreachable("Invalid integer condition!");
3942   case X86::COND_E:
3943   case X86::COND_NE:
3944   case X86::COND_B:
3945   case X86::COND_A:
3946   case X86::COND_BE:
3947   case X86::COND_AE:
3948     return true;
3949   case X86::COND_G:
3950   case X86::COND_GE:
3951   case X86::COND_L:
3952   case X86::COND_LE:
3953     return false;
3954   }
3955 }
3956 
3957 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
3958   switch (SetCCOpcode) {
3959   default: llvm_unreachable("Invalid integer condition!");
3960   case ISD::SETEQ:  return X86::COND_E;
3961   case ISD::SETGT:  return X86::COND_G;
3962   case ISD::SETGE:  return X86::COND_GE;
3963   case ISD::SETLT:  return X86::COND_L;
3964   case ISD::SETLE:  return X86::COND_LE;
3965   case ISD::SETNE:  return X86::COND_NE;
3966   case ISD::SETULT: return X86::COND_B;
3967   case ISD::SETUGT: return X86::COND_A;
3968   case ISD::SETULE: return X86::COND_BE;
3969   case ISD::SETUGE: return X86::COND_AE;
3970   }
3971 }
3972 
3973 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3974 /// condition code, returning the condition code and the LHS/RHS of the
3975 /// comparison to make.
3976 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
3977                                bool isFP, SDValue &LHS, SDValue &RHS,
3978                                SelectionDAG &DAG) {
3979   if (!isFP) {
3980     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3981       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3982         // X > -1   -> X == 0, jump !sign.
3983         RHS = DAG.getConstant(0, DL, RHS.getValueType());
3984         return X86::COND_NS;
3985       }
3986       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3987         // X < 0   -> X == 0, jump on sign.
3988         return X86::COND_S;
3989       }
3990       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3991         // X < 1   -> X <= 0
3992         RHS = DAG.getConstant(0, DL, RHS.getValueType());
3993         return X86::COND_LE;
3994       }
3995     }
3996 
3997     return TranslateIntegerX86CC(SetCCOpcode);
3998   }
3999 
4000   // First determine if it is required or is profitable to flip the operands.
4001 
4002   // If LHS is a foldable load, but RHS is not, flip the condition.
4003   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4004       !ISD::isNON_EXTLoad(RHS.getNode())) {
4005     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4006     std::swap(LHS, RHS);
4007   }
4008 
4009   switch (SetCCOpcode) {
4010   default: break;
4011   case ISD::SETOLT:
4012   case ISD::SETOLE:
4013   case ISD::SETUGT:
4014   case ISD::SETUGE:
4015     std::swap(LHS, RHS);
4016     break;
4017   }
4018 
4019   // On a floating point condition, the flags are set as follows:
4020   // ZF  PF  CF   op
4021   //  0 | 0 | 0 | X > Y
4022   //  0 | 0 | 1 | X < Y
4023   //  1 | 0 | 0 | X == Y
4024   //  1 | 1 | 1 | unordered
4025   switch (SetCCOpcode) {
4026   default: llvm_unreachable("Condcode should be pre-legalized away");
4027   case ISD::SETUEQ:
4028   case ISD::SETEQ:   return X86::COND_E;
4029   case ISD::SETOLT:              // flipped
4030   case ISD::SETOGT:
4031   case ISD::SETGT:   return X86::COND_A;
4032   case ISD::SETOLE:              // flipped
4033   case ISD::SETOGE:
4034   case ISD::SETGE:   return X86::COND_AE;
4035   case ISD::SETUGT:              // flipped
4036   case ISD::SETULT:
4037   case ISD::SETLT:   return X86::COND_B;
4038   case ISD::SETUGE:              // flipped
4039   case ISD::SETULE:
4040   case ISD::SETLE:   return X86::COND_BE;
4041   case ISD::SETONE:
4042   case ISD::SETNE:   return X86::COND_NE;
4043   case ISD::SETUO:   return X86::COND_P;
4044   case ISD::SETO:    return X86::COND_NP;
4045   case ISD::SETOEQ:
4046   case ISD::SETUNE:  return X86::COND_INVALID;
4047   }
4048 }
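
// Illustrative example for the FP path above: for setolt %x, %y the operands
// are swapped (the comparison becomes %y vs. %x) and the condition maps to
// X86::COND_A. With UCOMISS/UCOMISD flag semantics, "above" (CF = 0 and
// ZF = 0) holds exactly when %y > %x and the operands are ordered, which is
// precisely the ordered less-than of the original operands; an unordered
// result sets CF and correctly fails the test.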
4049 
4050 /// Is there a floating point cmov for the specific X86 condition code?
4051 /// The current x86 ISA includes the following FP cmov instructions:
4052 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4053 static bool hasFPCMov(unsigned X86CC) {
4054   switch (X86CC) {
4055   default:
4056     return false;
4057   case X86::COND_B:
4058   case X86::COND_BE:
4059   case X86::COND_E:
4060   case X86::COND_P:
4061   case X86::COND_A:
4062   case X86::COND_AE:
4063   case X86::COND_NE:
4064   case X86::COND_NP:
4065     return true;
4066   }
4067 }
4068 
4069 
4070 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4071                                            const CallInst &I,
4072                                            unsigned Intrinsic) const {
4073 
4074   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4075   if (!IntrData)
4076     return false;
4077 
4078   Info.opc = ISD::INTRINSIC_W_CHAIN;
4079   Info.readMem = false;
4080   Info.writeMem = false;
4081   Info.vol = false;
4082   Info.offset = 0;
4083 
4084   switch (IntrData->Type) {
4085   case EXPAND_FROM_MEM: {
4086     Info.ptrVal = I.getArgOperand(0);
4087     Info.memVT = MVT::getVT(I.getType());
4088     Info.align = 1;
4089     Info.readMem = true;
4090     break;
4091   }
4092   case COMPRESS_TO_MEM: {
4093     Info.ptrVal = I.getArgOperand(0);
4094     Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4095     Info.align = 1;
4096     Info.writeMem = true;
4097     break;
4098   }
4099   case TRUNCATE_TO_MEM_VI8:
4100   case TRUNCATE_TO_MEM_VI16:
4101   case TRUNCATE_TO_MEM_VI32: {
4102     Info.ptrVal = I.getArgOperand(0);
4103     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
4104     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4105     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4106       ScalarVT = MVT::i8;
4107     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4108       ScalarVT = MVT::i16;
4109     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4110       ScalarVT = MVT::i32;
4111 
4112     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4113     Info.align = 1;
4114     Info.writeMem = true;
4115     break;
4116   }
4117   default:
4118     return false;
4119   }
4120 
4121   return true;
4122 }
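
// Illustrative example for the TRUNCATE_TO_MEM_VI8 case above: for an
// intrinsic whose value operand is a v8i64 vector, Info.memVT becomes v8i8,
// Info.ptrVal is the destination pointer (operand 0), and writeMem is set,
// so the resulting memory intrinsic node is modelled as an 8-byte store.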
4123 
4124 /// Returns true if the target can instruction select the
4125 /// specified FP immediate natively. If false, the legalizer will
4126 /// materialize the FP immediate as a load from a constant pool.
4127 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4128   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4129     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4130       return true;
4131   }
4132   return false;
4133 }
4134 
4135 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4136                                               ISD::LoadExtType ExtTy,
4137                                               EVT NewVT) const {
4138   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4139   // relocation target a movq or addq instruction: don't let the load shrink.
4140   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4141   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4142     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4143       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4144   return true;
4145 }
4146 
4147 /// \brief Returns true if it is beneficial to convert a load of a constant
4148 /// to just the constant itself.
4149 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4150                                                           Type *Ty) const {
4151   assert(Ty->isIntegerTy());
4152 
4153   unsigned BitSize = Ty->getPrimitiveSizeInBits();
4154   if (BitSize == 0 || BitSize > 64)
4155     return false;
4156   return true;
4157 }
4158 
4159 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4160                                                 unsigned Index) const {
4161   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4162     return false;
4163 
4164   return (Index == 0 || Index == ResVT.getVectorNumElements());
4165 }
4166 
4167 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4168   // Speculate cttz only if we can directly use TZCNT.
4169   return Subtarget.hasBMI();
4170 }
4171 
4172 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4173   // Speculate ctlz only if we can directly use LZCNT.
4174   return Subtarget.hasLZCNT();
4175 }
4176 
4177 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4178   if (!Subtarget.hasBMI())
4179     return false;
4180 
4181   // There are only 32-bit and 64-bit forms for 'andn'.
4182   EVT VT = Y.getValueType();
4183   if (VT != MVT::i32 && VT != MVT::i64)
4184     return false;
4185 
4186   return true;
4187 }
4188 
4189 /// Return true if every element in Mask, beginning
4190 /// at position Pos and ending before Pos+Size, is undef.
4191 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4192   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4193     if (0 <= Mask[i])
4194       return false;
4195   return true;
4196 }
4197 
4198 /// Return true if Val is undef or if its value falls within the
4199 /// specified range [Low, Hi).
4200 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4201   return (Val < 0) || (Val >= Low && Val < Hi);
4202 }
4203 
4204 /// Return true if every element in Mask is undef or if its value
4205 /// falls within the specified range [Low, Hi).
4206 static bool isUndefOrInRange(ArrayRef<int> Mask,
4207                              int Low, int Hi) {
4208   for (int M : Mask)
4209     if (!isUndefOrInRange(M, Low, Hi))
4210       return false;
4211   return true;
4212 }
4213 
4214 /// Val is either less than zero (undef) or equal to the specified value.
4215 static bool isUndefOrEqual(int Val, int CmpVal) {
4216   return (Val < 0 || Val == CmpVal);
4217 }
4218 
4219 /// Val is either the undef or zero sentinel value.
4220 static bool isUndefOrZero(int Val) {
4221   return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
4222 }
4223 
4224 /// Return true if every element in Mask, beginning
4225 /// at position Pos and ending before Pos+Size, falls within the specified
4226 /// sequential range [Low, Low+Size) or is undef.
4227 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4228                                        unsigned Pos, unsigned Size, int Low) {
4229   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4230     if (!isUndefOrEqual(Mask[i], Low))
4231       return false;
4232   return true;
4233 }
4234 
4235 /// Return true if every element in Mask, beginning
4236 /// at position Pos and ending before Pos+Size, falls within the specified
4237 /// sequential range [Low, Low+Size), or is undef or zero.
4238 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4239                                              unsigned Size, int Low) {
4240   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4241     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4242       return false;
4243   return true;
4244 }
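
// Illustrative examples: with Pos = 0, Size = 4 and Low = 4, the mask
// { 4, 5, SM_SentinelUndef, 7 } satisfies isSequentialOrUndefInRange, and
// { 4, SM_SentinelZero, 6, 7 } additionally satisfies the *OrZero variant,
// since zero sentinels are tolerated there as well.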
4245 
4246 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4247 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4248 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4249   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4250   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4251     return false;
4252 
4253   // The index should be aligned on a vecWidth-bit boundary.
4254   uint64_t Index =
4255     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4256 
4257   MVT VT = N->getSimpleValueType(0);
4258   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4259   bool Result = (Index * ElSize) % vecWidth == 0;
4260 
4261   return Result;
4262 }
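
// Illustrative example: extracting from a v8f32 with vecWidth = 128, a
// constant index of 4 gives 4 * 32 = 128 bits, which is 128-bit aligned and
// therefore accepted, while an index of 2 (64 bits) is rejected.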
4263 
4264 /// Return true if the specified INSERT_SUBVECTOR
4265 /// operand specifies a subvector insert that is suitable for the
4266 /// insertion of 128- or 256-bit subvectors.
4267 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4268   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4269   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4270     return false;
4271   // The index should be aligned on a vecWidth-bit boundary.
4272   uint64_t Index =
4273     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4274 
4275   MVT VT = N->getSimpleValueType(0);
4276   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4277   bool Result = (Index * ElSize) % vecWidth == 0;
4278 
4279   return Result;
4280 }
4281 
4282 bool X86::isVINSERT128Index(SDNode *N) {
4283   return isVINSERTIndex(N, 128);
4284 }
4285 
4286 bool X86::isVINSERT256Index(SDNode *N) {
4287   return isVINSERTIndex(N, 256);
4288 }
4289 
4290 bool X86::isVEXTRACT128Index(SDNode *N) {
4291   return isVEXTRACTIndex(N, 128);
4292 }
4293 
4294 bool X86::isVEXTRACT256Index(SDNode *N) {
4295   return isVEXTRACTIndex(N, 256);
4296 }
4297 
4298 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4299   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4300   assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4301          "Illegal extract subvector for VEXTRACT");
4302 
4303   uint64_t Index =
4304     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4305 
4306   MVT VecVT = N->getOperand(0).getSimpleValueType();
4307   MVT ElVT = VecVT.getVectorElementType();
4308 
4309   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4310   return Index / NumElemsPerChunk;
4311 }
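
// Illustrative example: extracting the upper v4f32 half of a v8f32 (element
// index 4) with vecWidth = 128 gives NumElemsPerChunk = 128 / 32 = 4, so the
// returned immediate is 4 / 4 = 1, selecting the high 128-bit lane.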
4312 
4313 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4314   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4315   assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4316          "Illegal insert subvector for VINSERT");
4317 
4318   uint64_t Index =
4319     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4320 
4321   MVT VecVT = N->getSimpleValueType(0);
4322   MVT ElVT = VecVT.getVectorElementType();
4323 
4324   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4325   return Index / NumElemsPerChunk;
4326 }
4327 
4328 /// Return the appropriate immediate to extract the specified
4329 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4330 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4331   return getExtractVEXTRACTImmediate(N, 128);
4332 }
4333 
4334 /// Return the appropriate immediate to extract the specified
4335 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4336 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4337   return getExtractVEXTRACTImmediate(N, 256);
4338 }
4339 
4340 /// Return the appropriate immediate to insert at the specified
4341 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4342 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4343   return getInsertVINSERTImmediate(N, 128);
4344 }
4345 
4346 /// Return the appropriate immediate to insert at the specified
4347 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4348 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4349   return getInsertVINSERTImmediate(N, 256);
4350 }
4351 
4352 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4353 bool X86::isZeroNode(SDValue Elt) {
4354   return isNullConstant(Elt) || isNullFPConstant(Elt);
4355 }
4356 
4357 // Build a vector of constants.
4358 // Use an UNDEF node if MaskElt == -1.
4359 // Split 64-bit constants into 32-bit halves in 32-bit mode.
4360 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4361                               const SDLoc &dl, bool IsMask = false) {
4362 
4363   SmallVector<SDValue, 32>  Ops;
4364   bool Split = false;
4365 
4366   MVT ConstVecVT = VT;
4367   unsigned NumElts = VT.getVectorNumElements();
4368   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4369   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4370     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4371     Split = true;
4372   }
4373 
4374   MVT EltVT = ConstVecVT.getVectorElementType();
4375   for (unsigned i = 0; i < NumElts; ++i) {
4376     bool IsUndef = Values[i] < 0 && IsMask;
4377     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4378       DAG.getConstant(Values[i], dl, EltVT);
4379     Ops.push_back(OpNode);
4380     if (Split)
4381       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4382                     DAG.getConstant(0, dl, EltVT));
4383   }
4384   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4385   if (Split)
4386     ConstsNode = DAG.getBitcast(VT, ConstsNode);
4387   return ConstsNode;
4388 }
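
// Illustrative example: building { 1, -1 } as a v2i64 mask (IsMask = true) in
// 32-bit mode, where i64 is not legal, produces a v4i32 build_vector of
// { 1, 0, undef, undef } that is then bitcast back to v2i64.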
4389 
4390 /// Returns a vector of specified type with all zero elements.
4391 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4392                              SelectionDAG &DAG, const SDLoc &dl) {
4393   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4394           VT.getVectorElementType() == MVT::i1) &&
4395          "Unexpected vector type");
4396 
4397   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4398   // type. This ensures they get CSE'd. But if the integer type is not
4399   // available, use a floating-point +0.0 instead.
4400   SDValue Vec;
4401   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4402     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4403   } else if (VT.getVectorElementType() == MVT::i1) {
4404     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4405            "Unexpected vector type");
4406     assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4407            "Unexpected vector type");
4408     Vec = DAG.getConstant(0, dl, VT);
4409   } else {
4410     unsigned Num32BitElts = VT.getSizeInBits() / 32;
4411     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4412   }
4413   return DAG.getBitcast(VT, Vec);
4414 }
4415 
4416 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4417                                 const SDLoc &dl, unsigned vectorWidth) {
4418   assert((vectorWidth == 128 || vectorWidth == 256) &&
4419          "Unsupported vector width");
4420   EVT VT = Vec.getValueType();
4421   EVT ElVT = VT.getVectorElementType();
4422   unsigned Factor = VT.getSizeInBits()/vectorWidth;
4423   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4424                                   VT.getVectorNumElements()/Factor);
4425 
4426   // Extract from UNDEF is UNDEF.
4427   if (Vec.isUndef())
4428     return DAG.getUNDEF(ResultVT);
4429 
4430   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
4431   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4432   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4433 
4434   // This is the index of the first element of the vectorWidth-bit chunk
4435   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4436   IdxVal &= ~(ElemsPerChunk - 1);
4437 
4438   // If the input is a buildvector just emit a smaller one.
4439   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4440     return DAG.getNode(ISD::BUILD_VECTOR,
4441          dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4442 
4443   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4444   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4445 }
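
// Illustrative example: asking for IdxVal = 5 of a v8f32 with
// vectorWidth = 128 rounds the index down to 4 (ElemsPerChunk = 4), so the
// result is the upper v4f32 half extracted at element 4.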
4446 
4447 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
4448 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4449 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4450 /// instructions or a simple subregister reference. Idx is an index in the
4451 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
4452 /// lowering EXTRACT_VECTOR_ELT operations easier.
4453 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4454                                    SelectionDAG &DAG, const SDLoc &dl) {
4455   assert((Vec.getValueType().is256BitVector() ||
4456           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4457   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4458 }
4459 
4460 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4461 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4462                                    SelectionDAG &DAG, const SDLoc &dl) {
4463   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4464   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4465 }
4466 
4467 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4468                                SelectionDAG &DAG, const SDLoc &dl,
4469                                unsigned vectorWidth) {
4470   assert((vectorWidth == 128 || vectorWidth == 256) &&
4471          "Unsupported vector width");
4472   // Inserting UNDEF just returns Result.
4473   if (Vec.isUndef())
4474     return Result;
4475   EVT VT = Vec.getValueType();
4476   EVT ElVT = VT.getVectorElementType();
4477   EVT ResultVT = Result.getValueType();
4478 
4479   // Insert the relevant vectorWidth bits.
4480   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4481   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4482 
4483   // This is the index of the first element of the vectorWidth-bit chunk
4484   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4485   IdxVal &= ~(ElemsPerChunk - 1);
4486 
4487   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4488   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4489 }
4490 
4491 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
4492 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4493 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4494 /// simple superregister reference.  Idx is an index in the 128 bits
4495 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
4496 /// lowering INSERT_VECTOR_ELT operations easier.
4497 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4498                                   SelectionDAG &DAG, const SDLoc &dl) {
4499   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4500 
4501   // For insertion into the zero index (low half) of a 256-bit vector, it is
4502   // more efficient to generate a blend with immediate instead of an insert*128.
4503   // We are still creating an INSERT_SUBVECTOR below with an undef node to
4504   // extend the subvector to the size of the result vector. Make sure that
4505   // we are not recursing on that node by checking for undef here.
4506   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4507       !Result.isUndef()) {
4508     EVT ResultVT = Result.getValueType();
4509     SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4510     SDValue Undef = DAG.getUNDEF(ResultVT);
4511     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4512                                  Vec, ZeroIndex);
4513 
4514     // The blend instruction, and therefore its mask, depend on the data type.
4515     MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4516     if (ScalarType.isFloatingPoint()) {
4517       // Choose either vblendps (float) or vblendpd (double).
4518       unsigned ScalarSize = ScalarType.getSizeInBits();
4519       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4520       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4521       SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4522       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4523     }
4524 
4525     const X86Subtarget &Subtarget =
4526     static_cast<const X86Subtarget &>(DAG.getSubtarget());
4527 
4528     // AVX2 is needed for 256-bit integer blend support.
4529     // Integers must be cast to 32-bit because there is only vpblendd;
4530     // vpblendw can't be used for this because it has a handicapped mask.
4531 
4532     // If we don't have AVX2, then cast to float. Using a wrong domain blend
4533     // is still more efficient than using the wrong domain vinsertf128 that
4534     // will be created by InsertSubVector().
4535     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4536 
4537     SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4538     Result = DAG.getBitcast(CastVT, Result);
4539     Vec256 = DAG.getBitcast(CastVT, Vec256);
4540     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4541     return DAG.getBitcast(ResultVT, Vec256);
4542   }
4543 
4544   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4545 }
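
// Illustrative example of the blend path above: inserting a v4f32 into the
// low half (IdxVal = 0) of a defined v8f32 first widens the subvector with an
// INSERT_SUBVECTOR into undef, then emits X86ISD::BLENDI with mask 0x0f
// (vblendps) so the four low lanes come from the new subvector; for v4f64 the
// mask would be 0x03 (vblendpd).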
4546 
4547 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4548                                   SelectionDAG &DAG, const SDLoc &dl) {
4549   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4550   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4551 }
4552 
4553 /// Insert i1-subvector to i1-vector.
4554 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4555                                 const X86Subtarget &Subtarget) {
4556 
4557   SDLoc dl(Op);
4558   SDValue Vec = Op.getOperand(0);
4559   SDValue SubVec = Op.getOperand(1);
4560   SDValue Idx = Op.getOperand(2);
4561 
4562   if (!isa<ConstantSDNode>(Idx))
4563     return SDValue();
4564 
4565   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4566   if (IdxVal == 0  && Vec.isUndef()) // the operation is legal
4567     return Op;
4568 
4569   MVT OpVT = Op.getSimpleValueType();
4570   MVT SubVecVT = SubVec.getSimpleValueType();
4571   unsigned NumElems = OpVT.getVectorNumElements();
4572   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4573 
4574   assert(IdxVal + SubVecNumElems <= NumElems &&
4575          IdxVal % SubVecVT.getSizeInBits() == 0 &&
4576          "Unexpected index value in INSERT_SUBVECTOR");
4577 
4578   // There are 3 possible cases:
4579   // 1. Subvector should be inserted in the lower part (IdxVal == 0)
4580   // 2. Subvector should be inserted in the upper part
4581   //    (IdxVal + SubVecNumElems == NumElems)
4582   // 3. Subvector should be inserted in the middle (for example v2i1
4583   //    to v16i1, index 2)
4584 
4585   // extend to natively supported kshift
4586   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4587   MVT WideOpVT = OpVT;
4588   if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
4589     WideOpVT = MinVT;
4590 
4591   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4592   SDValue Undef = DAG.getUNDEF(WideOpVT);
4593   SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4594                                    Undef, SubVec, ZeroIdx);
4595 
4596   // Extract the sub-vector if required.
4597   auto ExtractSubVec = [&](SDValue V) {
4598     return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
4599                                                 OpVT, V, ZeroIdx);
4600   };
4601 
4602   if (Vec.isUndef()) {
4603     if (IdxVal != 0) {
4604       SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
4605       WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
4606     }
4607     return ExtractSubVec(WideSubVec);
4608   }
4609 
4610   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4611     NumElems = WideOpVT.getVectorNumElements();
4612     unsigned ShiftLeft = NumElems - SubVecNumElems;
4613     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4614     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4615                              DAG.getConstant(ShiftLeft, dl, MVT::i8));
4616     Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
4617       DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
4618     return ExtractSubVec(Vec);
4619   }
4620 
4621   if (IdxVal == 0) {
4622     // Zero lower bits of the Vec
4623     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4624     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4625     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4626     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4627     // Merge them together, SubVec should be zero extended.
4628     WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4629                              getZeroVector(WideOpVT, Subtarget, DAG, dl),
4630                              SubVec, ZeroIdx);
4631     Vec =  DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4632     return ExtractSubVec(Vec);
4633   }
4634 
4635   // Simple case when we put subvector in the upper part
4636   if (IdxVal + SubVecNumElems == NumElems) {
4637     // Zero upper bits of the Vec
4638     WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4639                              DAG.getConstant(IdxVal, dl, MVT::i8));
4640     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4641     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4642     Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4643     Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4644     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4645     return ExtractSubVec(Vec);
4646   }
4647   // Subvector should be inserted in the middle - use shuffle
4648   WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
4649                            SubVec, ZeroIdx);
4650   SmallVector<int, 64> Mask;
4651   for (unsigned i = 0; i < NumElems; ++i)
4652     Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
4653                     i : i + NumElems);
4654   return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
4655 }
4656 
4657 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
4658 /// instructions. This is used because creating CONCAT_VECTOR nodes of
4659 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
4660 /// large BUILD_VECTORS.
4661 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
4662                                    unsigned NumElems, SelectionDAG &DAG,
4663                                    const SDLoc &dl) {
4664   SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4665   return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
4666 }
4667 
4668 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
4669                                    unsigned NumElems, SelectionDAG &DAG,
4670                                    const SDLoc &dl) {
4671   SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4672   return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
4673 }
4674 
4675 /// Returns a vector of specified type with all bits set.
4676 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4677 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4678 /// Then bitcast to their original type, ensuring they get CSE'd.
4679 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
4680                              SelectionDAG &DAG, const SDLoc &dl) {
4681   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4682          "Expected a 128/256/512-bit vector type");
4683 
4684   APInt Ones = APInt::getAllOnesValue(32);
4685   unsigned NumElts = VT.getSizeInBits() / 32;
4686   SDValue Vec;
4687   if (!Subtarget.hasInt256() && NumElts == 8) {
4688     Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
4689     Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4690   } else {
4691     Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4692   }
4693   return DAG.getBitcast(VT, Vec);
4694 }
4695 
4696 /// Returns a vector_shuffle node for an unpackl operation.
4697 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4698                           SDValue V1, SDValue V2) {
4699   assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4700   unsigned NumElems = VT.getVectorNumElements();
4701   SmallVector<int, 8> Mask(NumElems);
4702   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4703     Mask[i * 2]     = i;
4704     Mask[i * 2 + 1] = i + NumElems;
4705   }
4706   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4707 }
4708 
4709 /// Returns a vector_shuffle node for an unpackh operation.
4710 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4711                           SDValue V1, SDValue V2) {
4712   assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4713   unsigned NumElems = VT.getVectorNumElements();
4714   SmallVector<int, 8> Mask(NumElems);
4715   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4716     Mask[i * 2]     = i + Half;
4717     Mask[i * 2 + 1] = i + NumElems + Half;
4718   }
4719   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4720 }
4721 
4722 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
4723 /// This produces a shuffle where the low element of V2 is swizzled into the
4724 /// zero/undef vector, landing at element Idx.
4725 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4726 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4727                                            bool IsZero,
4728                                            const X86Subtarget &Subtarget,
4729                                            SelectionDAG &DAG) {
4730   MVT VT = V2.getSimpleValueType();
4731   SDValue V1 = IsZero
4732     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4733   int NumElems = VT.getVectorNumElements();
4734   SmallVector<int, 16> MaskVec(NumElems);
4735   for (int i = 0; i != NumElems; ++i)
4736     // If this is the insertion idx, put the low elt of V2 here.
4737     MaskVec[i] = (i == Idx) ? NumElems : i;
4738   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4739 }
4740 
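     /// Peek through a chain of bitcasts to return the underlying source value.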
4741 static SDValue peekThroughBitcasts(SDValue V) {
4742   while (V.getNode() && V.getOpcode() == ISD::BITCAST)
4743     V = V.getOperand(0);
4744   return V;
4745 }
4746 
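     /// Extract constant shuffle mask elements from \p MaskNode into \p RawMask,
     /// splitting each constant element into MaskEltSizeInBits-sized pieces.
     /// Returns true if the raw mask indices could be determined.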
4747 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
4748                                         unsigned MaskEltSizeInBits,
4749                                         SmallVectorImpl<uint64_t> &RawMask) {
4750   MaskNode = peekThroughBitcasts(MaskNode);
4751 
4752   MVT VT = MaskNode.getSimpleValueType();
4753   assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
4754 
4755   // Split an APInt element into MaskEltSizeInBits sized pieces and
4756   // insert into the shuffle mask.
4757   auto SplitElementToMask = [&](APInt Element) {
4758     // Note that this is x86 and so always little endian: the low byte is
4759     // the first byte of the mask.
4760     int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4761     for (int i = 0; i < Split; ++i) {
4762       APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
4763       Element = Element.lshr(MaskEltSizeInBits);
4764       RawMask.push_back(RawElt.getZExtValue());
4765     }
4766   };
4767 
4768   if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
4769     // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4770     // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
4771     if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
4772       return false;
4773     if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
4774       const APInt &MaskElement = CN->getAPIntValue();
4775       for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
4776         APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
4777         RawMask.push_back(RawElt.getZExtValue());
4778       }
           return true;
4779     }
4780     return false;
4781   }
4782 
4783   if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
4784       MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
4785 
4786     // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4787     if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4788       return false;
4789     unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4790 
4791     SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
4792     if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
4793       SplitElementToMask(CN->getAPIntValue());
4794       RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
4795       return true;
4796     }
4797     return false;
4798   }
4799 
4800   if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
4801     return false;
4802 
4803   // We can always decode if the buildvector is all zero constants,
4804   // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
4805   if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
4806     RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
4807     return true;
4808   }
4809 
4810   // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4811   if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4812     return false;
4813 
4814   for (SDValue Op : MaskNode->ops()) {
4815     if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
4816       SplitElementToMask(CN->getAPIntValue());
4817     else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
4818       SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
4819     else
4820       return false;
4821   }
4822 
4823   return true;
4824 }
4825 
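     /// If \p MaskNode is a (possibly bitcast) load from the constant pool, return
     /// the underlying Constant for the mask, otherwise return null.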
4826 static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
4827   MaskNode = peekThroughBitcasts(MaskNode);
4828 
4829   auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
4830   if (!MaskLoad)
4831     return nullptr;
4832 
4833   SDValue Ptr = MaskLoad->getBasePtr();
4834   if (Ptr->getOpcode() == X86ISD::Wrapper ||
4835       Ptr->getOpcode() == X86ISD::WrapperRIP)
4836     Ptr = Ptr->getOperand(0);
4837 
4838   auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
4839   if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
4840     return nullptr;
4841 
4842   return dyn_cast<Constant>(MaskCP->getConstVal());
4843 }
4844 
4845 /// Calculates the shuffle mask corresponding to the target-specific opcode.
4846 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
4847 /// operands in \p Ops, and returns true.
4848 /// Sets \p IsUnary to true if only one source is used. Note that this will set
4849 /// IsUnary for shuffles which use a single input multiple times, and in those
4850 /// cases it will adjust the mask to only have indices within that single input.
4851 /// It is an error to call this with non-empty Mask/Ops vectors.
4852 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
4853                                  SmallVectorImpl<SDValue> &Ops,
4854                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
4855   unsigned NumElems = VT.getVectorNumElements();
4856   SDValue ImmN;
4857 
4858   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
4859   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
4860 
4861   IsUnary = false;
4862   bool IsFakeUnary = false;
4863   switch(N->getOpcode()) {
4864   case X86ISD::BLENDI:
4865     ImmN = N->getOperand(N->getNumOperands()-1);
4866     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4867     break;
4868   case X86ISD::SHUFP:
4869     ImmN = N->getOperand(N->getNumOperands()-1);
4870     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4871     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4872     break;
4873   case X86ISD::INSERTPS:
4874     ImmN = N->getOperand(N->getNumOperands()-1);
4875     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4876     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4877     break;
4878   case X86ISD::UNPCKH:
4879     DecodeUNPCKHMask(VT, Mask);
4880     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4881     break;
4882   case X86ISD::UNPCKL:
4883     DecodeUNPCKLMask(VT, Mask);
4884     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4885     break;
4886   case X86ISD::MOVHLPS:
4887     DecodeMOVHLPSMask(NumElems, Mask);
4888     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4889     break;
4890   case X86ISD::MOVLHPS:
4891     DecodeMOVLHPSMask(NumElems, Mask);
4892     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4893     break;
4894   case X86ISD::PALIGNR:
4895     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4896     ImmN = N->getOperand(N->getNumOperands()-1);
4897     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4898     break;
4899   case X86ISD::VSHLDQ:
4900     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4901     ImmN = N->getOperand(N->getNumOperands() - 1);
4902     DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4903     IsUnary = true;
4904     break;
4905   case X86ISD::VSRLDQ:
4906     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4907     ImmN = N->getOperand(N->getNumOperands() - 1);
4908     DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4909     IsUnary = true;
4910     break;
4911   case X86ISD::PSHUFD:
4912   case X86ISD::VPERMILPI:
4913     ImmN = N->getOperand(N->getNumOperands()-1);
4914     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4915     IsUnary = true;
4916     break;
4917   case X86ISD::PSHUFHW:
4918     ImmN = N->getOperand(N->getNumOperands()-1);
4919     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4920     IsUnary = true;
4921     break;
4922   case X86ISD::PSHUFLW:
4923     ImmN = N->getOperand(N->getNumOperands()-1);
4924     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4925     IsUnary = true;
4926     break;
4927   case X86ISD::VZEXT_MOVL:
4928     DecodeZeroMoveLowMask(VT, Mask);
4929     IsUnary = true;
4930     break;
4931   case X86ISD::VPERMILPV: {
4932     IsUnary = true;
4933     SDValue MaskNode = N->getOperand(1);
4934     unsigned MaskEltSize = VT.getScalarSizeInBits();
4935     SmallVector<uint64_t, 32> RawMask;
4936     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
4937       DecodeVPERMILPMask(VT, RawMask, Mask);
4938       break;
4939     }
4940     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4941       DecodeVPERMILPMask(C, MaskEltSize, Mask);
4942       break;
4943     }
4944     return false;
4945   }
4946   case X86ISD::PSHUFB: {
4947     IsUnary = true;
4948     SDValue MaskNode = N->getOperand(1);
4949     SmallVector<uint64_t, 32> RawMask;
4950     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
4951       DecodePSHUFBMask(RawMask, Mask);
4952       break;
4953     }
4954     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4955       DecodePSHUFBMask(C, Mask);
4956       break;
4957     }
4958     return false;
4959   }
4960   case X86ISD::VPERMI:
4961     ImmN = N->getOperand(N->getNumOperands()-1);
4962     DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4963     IsUnary = true;
4964     break;
4965   case X86ISD::MOVSS:
4966   case X86ISD::MOVSD:
4967     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
4968     break;
4969   case X86ISD::VPERM2X128:
4970     ImmN = N->getOperand(N->getNumOperands()-1);
4971     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4972     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4973     break;
4974   case X86ISD::MOVSLDUP:
4975     DecodeMOVSLDUPMask(VT, Mask);
4976     IsUnary = true;
4977     break;
4978   case X86ISD::MOVSHDUP:
4979     DecodeMOVSHDUPMask(VT, Mask);
4980     IsUnary = true;
4981     break;
4982   case X86ISD::MOVDDUP:
4983     DecodeMOVDDUPMask(VT, Mask);
4984     IsUnary = true;
4985     break;
4986   case X86ISD::MOVLHPD:
4987   case X86ISD::MOVLPD:
4988   case X86ISD::MOVLPS:
4989     // Not yet implemented
4990     return false;
4991   case X86ISD::VPERMIL2: {
4992     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4993     unsigned MaskEltSize = VT.getScalarSizeInBits();
4994     SDValue MaskNode = N->getOperand(2);
4995     SDValue CtrlNode = N->getOperand(3);
4996     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
4997       unsigned CtrlImm = CtrlOp->getZExtValue();
4998       SmallVector<uint64_t, 32> RawMask;
4999       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5000         DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5001         break;
5002       }
5003       if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5004         DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5005         break;
5006       }
5007     }
5008     return false;
5009   }
5010   case X86ISD::VPPERM: {
5011     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5012     SDValue MaskNode = N->getOperand(2);
5013     SmallVector<uint64_t, 32> RawMask;
5014     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5015       DecodeVPPERMMask(RawMask, Mask);
5016       break;
5017     }
5018     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5019       DecodeVPPERMMask(C, Mask);
5020       break;
5021     }
5022     return false;
5023   }
5024   case X86ISD::VPERMV: {
5025     IsUnary = true;
5026     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5027     Ops.push_back(N->getOperand(1));
5028     SDValue MaskNode = N->getOperand(0);
5029     SmallVector<uint64_t, 32> RawMask;
5030     unsigned MaskEltSize = VT.getScalarSizeInBits();
5031     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5032       DecodeVPERMVMask(RawMask, Mask);
5033       break;
5034     }
5035     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5036       DecodeVPERMVMask(C, VT, Mask);
5037       break;
5038     }
5039     return false;
5040   }
5041   case X86ISD::VPERMV3: {
5042     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5043     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5044     Ops.push_back(N->getOperand(0));
5045     Ops.push_back(N->getOperand(2));
5046     SDValue MaskNode = N->getOperand(1);
5047     if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5048       DecodeVPERMV3Mask(C, VT, Mask);
5049       break;
5050     }
5051     return false;
5052   }
5053   default: llvm_unreachable("unknown target shuffle node");
5054   }
5055 
5056   // Empty mask indicates the decode failed.
5057   if (Mask.empty())
5058     return false;
5059 
5060   // Check if we're getting a shuffle mask with zero'd elements.
5061   if (!AllowSentinelZero)
5062     if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5063       return false;
5064 
5065   // If we have a fake unary shuffle, the shuffle mask is spread across two
5066   // inputs that are actually the same node. Re-map the mask to always point
5067   // into the first input.
5068   if (IsFakeUnary)
5069     for (int &M : Mask)
5070       if (M >= (int)Mask.size())
5071         M -= Mask.size();
5072 
5073   // If we didn't already add operands in the opcode-specific code, default to
5074   // adding 1 or 2 operands starting at 0.
5075   if (Ops.empty()) {
5076     Ops.push_back(N->getOperand(0));
5077     if (!IsUnary || IsFakeUnary)
5078       Ops.push_back(N->getOperand(1));
5079   }
5080 
5081   return true;
5082 }
5083 
5084 /// Check a target shuffle mask's inputs to see if we can set any values to
5085 /// SM_SentinelZero - this is for elements that are known to be zero
5086 /// (not just zeroable) from their inputs.
5087 /// Returns true if the target shuffle mask was decoded.
5088 static bool setTargetShuffleZeroElements(SDValue N,
5089                                          SmallVectorImpl<int> &Mask,
5090                                          SmallVectorImpl<SDValue> &Ops) {
5091   bool IsUnary;
5092   if (!isTargetShuffle(N.getOpcode()))
5093     return false;
5094   if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
5095                             Mask, IsUnary))
5096     return false;
5097 
5098   SDValue V1 = Ops[0];
5099   SDValue V2 = IsUnary ? V1 : Ops[1];
5100 
5101   V1 = peekThroughBitcasts(V1);
5102   V2 = peekThroughBitcasts(V2);
5103 
5104   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5105     int M = Mask[i];
5106 
5107     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5108     if (M < 0)
5109       continue;
5110 
5111     // Determine shuffle input and normalize the mask.
5112     SDValue V = M < Size ? V1 : V2;
5113     M %= Size;
5114 
5115     // We are referencing an UNDEF input.
5116     if (V.isUndef()) {
5117       Mask[i] = SM_SentinelUndef;
5118       continue;
5119     }
5120 
5121     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5122     if (V.getOpcode() != ISD::BUILD_VECTOR)
5123       continue;
5124 
5125     // If the BUILD_VECTOR has fewer elements than the mask, then the (larger)
5126     // source element must be UNDEF/ZERO.
5127     // TODO: Is it worth testing the individual bits of a constant?
5128     if ((Size % V.getNumOperands()) == 0) {
5129       int Scale = Size / V->getNumOperands();
5130       SDValue Op = V.getOperand(M / Scale);
5131       if (Op.isUndef())
5132         Mask[i] = SM_SentinelUndef;
5133       else if (X86::isZeroNode(Op))
5134         Mask[i] = SM_SentinelZero;
5135       continue;
5136     }
5137 
5138     // If the BUILD_VECTOR has more elements than the mask, then the (smaller)
5139     // source elements must be all UNDEF or all ZERO.
5140     if ((V.getNumOperands() % Size) == 0) {
5141       int Scale = V->getNumOperands() / Size;
5142       bool AllUndef = true;
5143       bool AllZero = true;
5144       for (int j = 0; j < Scale; ++j) {
5145         SDValue Op = V.getOperand((M * Scale) + j);
5146         AllUndef &= Op.isUndef();
5147         AllZero &= X86::isZeroNode(Op);
5148       }
5149       if (AllUndef)
5150         Mask[i] = SM_SentinelUndef;
5151       else if (AllZero)
5152         Mask[i] = SM_SentinelZero;
5153       continue;
5154     }
5155   }
5156 
5157   return true;
5158 }
5159 
5160 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5161 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5162 /// remaining input indices in case we now have a unary shuffle and adjust the
5163 /// Op0/Op1 inputs accordingly.
5164 /// Returns true if the target shuffle mask was decoded.
5165 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5166                                        SmallVectorImpl<int> &Mask) {
5167   SmallVector<SDValue, 2> Ops;
5168   if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5169     return false;
5170 
5171   int NumElts = Mask.size();
5172   bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
5173     return 0 <= Idx && Idx < NumElts;
5174   });
5175   bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
5176                               [NumElts](int Idx) { return NumElts <= Idx; });
5177 
5178   Op0 = Op0InUse ? Ops[0] : SDValue();
5179   Op1 = Op1InUse ? Ops[1] : SDValue();
5180 
5181   // We're only using Op1 - commute the mask and inputs.
5182   if (!Op0InUse && Op1InUse) {
5183     for (int &M : Mask)
5184       if (NumElts <= M)
5185         M -= NumElts;
5186     Op0 = Op1;
5187     Op1 = SDValue();
5188   }
5189 
5190   return true;
5191 }
5192 
5193 /// Returns the scalar element that will make up the ith
5194 /// element of the result of the vector shuffle.
5195 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5196                                    unsigned Depth) {
5197   if (Depth == 6)
5198     return SDValue();  // Limit search depth.
5199 
5200   SDValue V = SDValue(N, 0);
5201   EVT VT = V.getValueType();
5202   unsigned Opcode = V.getOpcode();
5203 
5204   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5205   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5206     int Elt = SV->getMaskElt(Index);
5207 
5208     if (Elt < 0)
5209       return DAG.getUNDEF(VT.getVectorElementType());
5210 
5211     unsigned NumElems = VT.getVectorNumElements();
5212     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5213                                          : SV->getOperand(1);
5214     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5215   }
5216 
5217   // Recurse into target specific vector shuffles to find scalars.
5218   if (isTargetShuffle(Opcode)) {
5219     MVT ShufVT = V.getSimpleValueType();
5220     MVT ShufSVT = ShufVT.getVectorElementType();
5221     int NumElems = (int)ShufVT.getVectorNumElements();
5222     SmallVector<int, 16> ShuffleMask;
5223     SmallVector<SDValue, 16> ShuffleOps;
5224     bool IsUnary;
5225 
5226     if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5227       return SDValue();
5228 
5229     int Elt = ShuffleMask[Index];
5230     if (Elt == SM_SentinelZero)
5231       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5232                                  : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5233     if (Elt == SM_SentinelUndef)
5234       return DAG.getUNDEF(ShufSVT);
5235 
5236     assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5237     SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5238     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5239                                Depth+1);
5240   }
5241 
5242   // Actual nodes that may contain scalar elements
5243   if (Opcode == ISD::BITCAST) {
5244     V = V.getOperand(0);
5245     EVT SrcVT = V.getValueType();
5246     unsigned NumElems = VT.getVectorNumElements();
5247 
5248     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5249       return SDValue();
5250   }
5251 
5252   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5253     return (Index == 0) ? V.getOperand(0)
5254                         : DAG.getUNDEF(VT.getVectorElementType());
5255 
5256   if (V.getOpcode() == ISD::BUILD_VECTOR)
5257     return V.getOperand(Index);
5258 
5259   return SDValue();
5260 }
5261 
5262 /// Custom lower build_vector of v16i8.
5263 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5264                                        unsigned NumNonZero, unsigned NumZero,
5265                                        SelectionDAG &DAG,
5266                                        const X86Subtarget &Subtarget,
5267                                        const TargetLowering &TLI) {
5268   if (NumNonZero > 8)
5269     return SDValue();
5270 
5271   SDLoc dl(Op);
5272   SDValue V;
5273   bool First = true;
5274 
5275   // SSE4.1 - use PINSRB to insert each byte directly.
5276   if (Subtarget.hasSSE41()) {
5277     for (unsigned i = 0; i < 16; ++i) {
5278       bool isNonZero = (NonZeros & (1 << i)) != 0;
5279       if (isNonZero) {
5280         if (First) {
5281           if (NumZero)
5282             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5283           else
5284             V = DAG.getUNDEF(MVT::v16i8);
5285           First = false;
5286         }
5287         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5288                         MVT::v16i8, V, Op.getOperand(i),
5289                         DAG.getIntPtrConstant(i, dl));
5290       }
5291     }
5292 
5293     return V;
5294   }
5295 
5296   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5297   for (unsigned i = 0; i < 16; ++i) {
5298     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5299     if (ThisIsNonZero && First) {
5300       if (NumZero)
5301         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5302       else
5303         V = DAG.getUNDEF(MVT::v8i16);
5304       First = false;
5305     }
5306 
5307     if ((i & 1) != 0) {
5308       SDValue ThisElt, LastElt;
5309       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5310       if (LastIsNonZero) {
5311         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5312                               MVT::i16, Op.getOperand(i-1));
5313       }
5314       if (ThisIsNonZero) {
5315         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5316         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5317                               ThisElt, DAG.getConstant(8, dl, MVT::i8));
5318         if (LastIsNonZero)
5319           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5320       } else
5321         ThisElt = LastElt;
5322 
5323       if (ThisElt.getNode())
5324         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5325                         DAG.getIntPtrConstant(i/2, dl));
5326     }
5327   }
5328 
5329   return DAG.getBitcast(MVT::v16i8, V);
5330 }
5331 
5332 /// Custom lower build_vector of v8i16.
5333 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5334                                      unsigned NumNonZero, unsigned NumZero,
5335                                      SelectionDAG &DAG,
5336                                      const X86Subtarget &Subtarget,
5337                                      const TargetLowering &TLI) {
5338   if (NumNonZero > 4)
5339     return SDValue();
5340 
5341   SDLoc dl(Op);
5342   SDValue V;
5343   bool First = true;
5344   for (unsigned i = 0; i < 8; ++i) {
5345     bool isNonZero = (NonZeros & (1 << i)) != 0;
5346     if (isNonZero) {
5347       if (First) {
5348         if (NumZero)
5349           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5350         else
5351           V = DAG.getUNDEF(MVT::v8i16);
5352         First = false;
5353       }
5354       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5355                       MVT::v8i16, V, Op.getOperand(i),
5356                       DAG.getIntPtrConstant(i, dl));
5357     }
5358   }
5359 
5360   return V;
5361 }
5362 
5363 /// Custom lower build_vector of v4i32 or v4f32.
5364 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5365                                      const X86Subtarget &Subtarget,
5366                                      const TargetLowering &TLI) {
5367   // Find all zeroable elements.
5368   std::bitset<4> Zeroable;
5369   for (int i=0; i < 4; ++i) {
5370     SDValue Elt = Op->getOperand(i);
5371     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
5372   }
5373   assert(Zeroable.size() - Zeroable.count() > 1 &&
5374          "We expect at least two non-zero elements!");
5375 
5376   // We only know how to deal with build_vector nodes where elements are either
5377   // zeroable or extract_vector_elt with constant index.
5378   SDValue FirstNonZero;
5379   unsigned FirstNonZeroIdx;
5380   for (unsigned i=0; i < 4; ++i) {
5381     if (Zeroable[i])
5382       continue;
5383     SDValue Elt = Op->getOperand(i);
5384     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5385         !isa<ConstantSDNode>(Elt.getOperand(1)))
5386       return SDValue();
5387     // Make sure that this node is extracting from a 128-bit vector.
5388     MVT VT = Elt.getOperand(0).getSimpleValueType();
5389     if (!VT.is128BitVector())
5390       return SDValue();
5391     if (!FirstNonZero.getNode()) {
5392       FirstNonZero = Elt;
5393       FirstNonZeroIdx = i;
5394     }
5395   }
5396 
5397   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5398   SDValue V1 = FirstNonZero.getOperand(0);
5399   MVT VT = V1.getSimpleValueType();
5400 
5401   // See if this build_vector can be lowered as a blend with zero.
5402   SDValue Elt;
5403   unsigned EltMaskIdx, EltIdx;
5404   int Mask[4];
5405   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5406     if (Zeroable[EltIdx]) {
5407       // The zero vector will be on the right hand side.
5408       Mask[EltIdx] = EltIdx+4;
5409       continue;
5410     }
5411 
5412     Elt = Op->getOperand(EltIdx);
5413     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5414     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5415     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5416       break;
5417     Mask[EltIdx] = EltIdx;
5418   }
5419 
5420   if (EltIdx == 4) {
5421     // Let the shuffle legalizer deal with blend operations.
5422     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5423     if (V1.getSimpleValueType() != VT)
5424       V1 = DAG.getBitcast(VT, V1);
5425     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
5426   }
5427 
5428   // See if we can lower this build_vector to a INSERTPS.
5429   if (!Subtarget.hasSSE41())
5430     return SDValue();
5431 
5432   SDValue V2 = Elt.getOperand(0);
5433   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5434     V1 = SDValue();
5435 
5436   bool CanFold = true;
5437   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5438     if (Zeroable[i])
5439       continue;
5440 
5441     SDValue Current = Op->getOperand(i);
5442     SDValue SrcVector = Current->getOperand(0);
5443     if (!V1.getNode())
5444       V1 = SrcVector;
5445     CanFold = SrcVector == V1 &&
5446       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5447   }
5448 
5449   if (!CanFold)
5450     return SDValue();
5451 
5452   assert(V1.getNode() && "Expected at least two non-zero elements!");
5453   if (V1.getSimpleValueType() != MVT::v4f32)
5454     V1 = DAG.getBitcast(MVT::v4f32, V1);
5455   if (V2.getSimpleValueType() != MVT::v4f32)
5456     V2 = DAG.getBitcast(MVT::v4f32, V2);
5457 
5458   // Ok, we can emit an INSERTPS instruction.
5459   unsigned ZMask = Zeroable.to_ulong();
5460 
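       // INSERTPS immediate layout: bits [7:6] select the source element, bits
       // [5:4] select the destination element, and bits [3:0] form the zero mask.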
5461   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5462   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5463   SDLoc DL(Op);
5464   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
5465                                DAG.getIntPtrConstant(InsertPSMask, DL));
5466   return DAG.getBitcast(VT, Result);
5467 }
5468 
5469 /// Return a vector logical shift node.
5470 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
5471                          SelectionDAG &DAG, const TargetLowering &TLI,
5472                          const SDLoc &dl) {
5473   assert(VT.is128BitVector() && "Unknown type for VShift");
5474   MVT ShVT = MVT::v16i8;
5475   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5476   SrcOp = DAG.getBitcast(ShVT, SrcOp);
5477   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
5478   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
5479   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
5480   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5481 }
5482 
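     /// If \p SrcOp is a scalar i32/f32 load from a stack slot, try to widen it to
     /// a full vector load of type \p VT and splat the loaded element via a shuffle.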
5483 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
5484                                       SelectionDAG &DAG) {
5485 
5486   // Check if the scalar load can be widened into a vector load. And if
5487   // the address is "base + cst" see if the cst can be "absorbed" into
5488   // the shuffle mask.
5489   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5490     SDValue Ptr = LD->getBasePtr();
5491     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5492       return SDValue();
5493     EVT PVT = LD->getValueType(0);
5494     if (PVT != MVT::i32 && PVT != MVT::f32)
5495       return SDValue();
5496 
5497     int FI = -1;
5498     int64_t Offset = 0;
5499     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5500       FI = FINode->getIndex();
5501       Offset = 0;
5502     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5503                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5504       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5505       Offset = Ptr.getConstantOperandVal(1);
5506       Ptr = Ptr.getOperand(0);
5507     } else {
5508       return SDValue();
5509     }
5510 
5511     // FIXME: 256-bit vector instructions don't require a strict alignment,
5512     // improve this code to support it better.
5513     unsigned RequiredAlign = VT.getSizeInBits()/8;
5514     SDValue Chain = LD->getChain();
5515     // Make sure the stack object alignment is at least 16 or 32.
5516     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5517     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5518       if (MFI->isFixedObjectIndex(FI)) {
5519         // Can't change the alignment. FIXME: It's possible to compute
5520         // the exact stack offset and reference FI + adjust offset instead.
5521         // If someone *really* cares about this. That's the way to implement it.
5522         return SDValue();
5523       } else {
5524         MFI->setObjectAlignment(FI, RequiredAlign);
5525       }
5526     }
5527 
5528     // (Offset % 16 or 32) must be a multiple of 4. The address is then
5529     // Ptr + (Offset & ~(RequiredAlign - 1)).
5530     if (Offset < 0)
5531       return SDValue();
5532     if ((Offset % RequiredAlign) & 3)
5533       return SDValue();
5534     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
5535     if (StartOffset) {
5536       SDLoc DL(Ptr);
5537       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5538                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
5539     }
5540 
5541     int EltNo = (Offset - StartOffset) >> 2;
5542     unsigned NumElems = VT.getVectorNumElements();
5543 
5544     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5545     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5546                              LD->getPointerInfo().getWithOffset(StartOffset),
5547                              false, false, false, 0);
5548 
5549     SmallVector<int, 8> Mask(NumElems, EltNo);
5550 
5551     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
5552   }
5553 
5554   return SDValue();
5555 }
5556 
5557 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
5558 /// elements can be replaced by a single large load which has the same value as
5559 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
5560 ///
5561 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
5562 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
5563                                         SDLoc &DL, SelectionDAG &DAG,
5564                                         bool isAfterLegalize) {
5565   unsigned NumElems = Elts.size();
5566 
5567   int LastLoadedElt = -1;
5568   SmallBitVector LoadMask(NumElems, false);
5569   SmallBitVector ZeroMask(NumElems, false);
5570   SmallBitVector UndefMask(NumElems, false);
5571 
5572   // For each element in the initializer, see if we've found a load, zero or an
5573   // undef.
5574   for (unsigned i = 0; i < NumElems; ++i) {
5575     SDValue Elt = peekThroughBitcasts(Elts[i]);
5576     if (!Elt.getNode())
5577       return SDValue();
5578 
5579     if (Elt.isUndef())
5580       UndefMask[i] = true;
5581     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
5582       ZeroMask[i] = true;
5583     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
5584       LoadMask[i] = true;
5585       LastLoadedElt = i;
5586       // Each loaded element must be the correct fractional portion of the
5587       // requested vector load.
5588       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
5589         return SDValue();
5590     } else
5591       return SDValue();
5592   }
5593   assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
5594          "Incomplete element masks");
5595 
5596   // Handle Special Cases - all undef or undef/zero.
5597   if (UndefMask.count() == NumElems)
5598     return DAG.getUNDEF(VT);
5599 
5600   // FIXME: Should we return this as a BUILD_VECTOR instead?
5601   if ((ZeroMask | UndefMask).count() == NumElems)
5602     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
5603                           : DAG.getConstantFP(0.0, DL, VT);
5604 
5605   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5606   int FirstLoadedElt = LoadMask.find_first();
5607   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
5608   LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
5609   EVT LDBaseVT = EltBase.getValueType();
5610 
5611   // Consecutive loads can contain UNDEF but not ZERO elements.
5612   // Consecutive loads with UNDEF and ZERO elements require an
5613   // additional shuffle stage to clear the ZERO elements.
5614   bool IsConsecutiveLoad = true;
5615   bool IsConsecutiveLoadWithZeros = true;
5616   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
5617     if (LoadMask[i]) {
5618       SDValue Elt = peekThroughBitcasts(Elts[i]);
5619       LoadSDNode *LD = cast<LoadSDNode>(Elt);
5620       if (!DAG.areNonVolatileConsecutiveLoads(
5621               LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
5622               i - FirstLoadedElt)) {
5623         IsConsecutiveLoad = false;
5624         IsConsecutiveLoadWithZeros = false;
5625         break;
5626       }
5627     } else if (ZeroMask[i]) {
5628       IsConsecutiveLoad = false;
5629     }
5630   }
5631 
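       // Create a full-width load from LDBase and, if LDBase's output chain has
       // users, splice the new load's chain in via a TokenFactor.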
5632   auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
5633     SDValue NewLd = DAG.getLoad(
5634         VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5635         LDBase->getPointerInfo(), false /*LDBase->isVolatile()*/,
5636         LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment());
5637 
5638     if (LDBase->hasAnyUseOfValue(1)) {
5639       SDValue NewChain =
5640           DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5641                       SDValue(NewLd.getNode(), 1));
5642       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5643       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5644                              SDValue(NewLd.getNode(), 1));
5645     }
5646 
5647     return NewLd;
5648   };
5649 
5650   // LOAD - all consecutive load/undefs (must start/end with a load).
5651   // If we have found an entire vector of loads and undefs, then return a large
5652   // load of the entire vector width starting at the base pointer.
5653   // If the vector contains zeros, then attempt to shuffle those elements.
5654   if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
5655       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
5656     assert(LDBase && "Did not find base load for merging consecutive loads");
5657     EVT EltVT = LDBase->getValueType(0);
5658     // Ensure that the input vector size for the merged loads matches the
5659     // cumulative size of the input elements.
5660     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
5661       return SDValue();
5662 
5663     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
5664       return SDValue();
5665 
5666     if (IsConsecutiveLoad)
5667       return CreateLoad(VT, LDBase);
5668 
5669     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
5670     // vector and a zero vector to clear out the zero elements.
5671     if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
5672       SmallVector<int, 4> ClearMask(NumElems, -1);
5673       for (unsigned i = 0; i < NumElems; ++i) {
5674         if (ZeroMask[i])
5675           ClearMask[i] = i + NumElems;
5676         else if (LoadMask[i])
5677           ClearMask[i] = i;
5678       }
5679       SDValue V = CreateLoad(VT, LDBase);
5680       SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
5681                                  : DAG.getConstantFP(0.0, DL, VT);
5682       return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
5683     }
5684   }
5685 
5686   int LoadSize =
5687       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
5688 
5689   // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
5690   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
5691       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5692     MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
5693     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
5694     if (TLI.isTypeLegal(VecVT)) {
5695       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
5696       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5697       SDValue ResNode =
5698           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
5699                                   LDBase->getPointerInfo(),
5700                                   LDBase->getAlignment(),
5701                                   false/*isVolatile*/, true/*ReadMem*/,
5702                                   false/*WriteMem*/);
5703 
5704       // Make sure the newly-created LOAD is in the same position as LDBase in
5705       // terms of dependency. We create a TokenFactor for LDBase and ResNode,
5706       // and update uses of LDBase's output chain to use the TokenFactor.
5707       if (LDBase->hasAnyUseOfValue(1)) {
5708         SDValue NewChain =
5709             DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5710                         SDValue(ResNode.getNode(), 1));
5711         DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5712         DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5713                                SDValue(ResNode.getNode(), 1));
5714       }
5715 
5716       return DAG.getBitcast(VT, ResNode);
5717     }
5718   }
5719 
5720   // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
5721   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
5722       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5723     MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
5724     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
5725     if (TLI.isTypeLegal(VecVT)) {
5726       SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
5727                                      : DAG.getBitcast(VecSVT, EltBase);
5728       V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
5729       V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
5730       return DAG.getBitcast(VT, V);
5731     }
5732   }
5733 
5734   return SDValue();
5735 }
5736 
5737 /// Attempt to use the vbroadcast instruction to generate a splat value for the
5738 /// following cases:
5739 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5740 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5741 /// a scalar load, or a constant.
5742 /// The VBROADCAST node is returned when a pattern is found,
5743 /// or SDValue() otherwise.
5744 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
5745                                     SelectionDAG &DAG) {
5746   // VBROADCAST requires AVX.
5747   // TODO: Splats could be generated for non-AVX CPUs using SSE
5748   // instructions, but there's less potential gain for only 128-bit vectors.
5749   if (!Subtarget.hasAVX())
5750     return SDValue();
5751 
5752   MVT VT = Op.getSimpleValueType();
5753   SDLoc dl(Op);
5754 
5755   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5756          "Unsupported vector type for broadcast.");
5757 
5758   SDValue Ld;
5759   bool ConstSplatVal;
5760 
5761   switch (Op.getOpcode()) {
5762     default:
5763       // Unknown pattern found.
5764       return SDValue();
5765 
5766     case ISD::BUILD_VECTOR: {
5767       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
5768       BitVector UndefElements;
5769       SDValue Splat = BVOp->getSplatValue(&UndefElements);
5770 
5771       // We need a splat of a single value to use broadcast, and it doesn't
5772       // make any sense if the value is only in one element of the vector.
5773       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
5774         return SDValue();
5775 
5776       Ld = Splat;
5777       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5778                        Ld.getOpcode() == ISD::ConstantFP);
5779 
5780       // Make sure that all of the users of a non-constant load are from the
5781       // BUILD_VECTOR node.
5782       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
5783         return SDValue();
5784       break;
5785     }
5786 
5787     case ISD::VECTOR_SHUFFLE: {
5788       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5789 
5790       // Shuffles must have a splat mask where the first element is
5791       // broadcasted.
5792       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5793         return SDValue();
5794 
5795       SDValue Sc = Op.getOperand(0);
5796       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5797           Sc.getOpcode() != ISD::BUILD_VECTOR) {
5798 
5799         if (!Subtarget.hasInt256())
5800           return SDValue();
5801 
5802         // Use the register form of the broadcast instruction available on AVX2.
5803         if (VT.getSizeInBits() >= 256)
5804           Sc = extract128BitVector(Sc, 0, DAG, dl);
5805         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5806       }
5807 
5808       Ld = Sc.getOperand(0);
5809       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5810                        Ld.getOpcode() == ISD::ConstantFP);
5811 
5812       // The scalar_to_vector node and the suspected
5813       // load node must have exactly one user.
5814       // Constants may have multiple users.
5815 
5816       // AVX-512 has a register version of the broadcast.
5817       bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
5818         Ld.getValueType().getSizeInBits() >= 32;
5819       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
5820           !hasRegVer))
5821         return SDValue();
5822       break;
5823     }
5824   }
5825 
5826   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5827   bool IsGE256 = (VT.getSizeInBits() >= 256);
5828 
5829   // When optimizing for size, generate up to 5 extra bytes for a broadcast
5830   // instruction to save 8 or more bytes of constant pool data.
5831   // TODO: If multiple splats are generated to load the same constant,
5832   // it may be detrimental to overall size. There needs to be a way to detect
5833   // that condition to know if this is truly a size win.
5834   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
5835 
5836   // Handle broadcasting a single constant scalar from the constant pool
5837   // into a vector.
5838   // On Sandybridge (no AVX2), it is still better to load a constant vector
5839   // from the constant pool and not to broadcast it from a scalar.
5840   // But override that restriction when optimizing for size.
5841   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
5842   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
5843     EVT CVT = Ld.getValueType();
5844     assert(!CVT.isVector() && "Must not broadcast a vector type");
5845 
5846     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
5847     // For size optimization, also splat v2f64 and v2i64, and for size opt
5848     // with AVX2, also splat i8 and i16.
5849     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
5850     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5851         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
5852       const Constant *C = nullptr;
5853       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5854         C = CI->getConstantIntValue();
5855       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5856         C = CF->getConstantFPValue();
5857 
5858       assert(C && "Invalid constant type");
5859 
5860       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5861       SDValue CP =
5862           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
5863       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5864       Ld = DAG.getLoad(
5865           CVT, dl, DAG.getEntryNode(), CP,
5866           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
5867           false, false, Alignment);
5868 
5869       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5870     }
5871   }
5872 
5873   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5874 
5875   // Handle AVX2 in-register broadcasts.
5876   if (!IsLoad && Subtarget.hasInt256() &&
5877       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
5878     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5879 
5880   // The scalar source must be a normal load.
5881   if (!IsLoad)
5882     return SDValue();
5883 
5884   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5885       (Subtarget.hasVLX() && ScalarSize == 64))
5886     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5887 
5888   // The integer check is needed for the 64-bit into 128-bit case, so that it
5889   // doesn't match double, since there is no vbroadcastsd xmm.
5890   if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
5891     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5892       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5893   }
5894 
5895   // Unsupported broadcast.
5896   return SDValue();
5897 }
5898 
5899 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
5900 /// underlying vector and index.
5901 ///
5902 /// Modifies \p ExtractedFromVec to the real vector and returns the real
5903 /// index.
5904 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
5905                                          SDValue ExtIdx) {
5906   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5907   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
5908     return Idx;
5909 
5910   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
5911   // lowered this:
5912   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
5913   // to:
5914   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
5915   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
5916   //                           undef)
5917   //                       Constant<0>)
5918   // In this case the vector is the extract_subvector expression and the index
5919   // is 2, as specified by the shuffle.
5920   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
5921   SDValue ShuffleVec = SVOp->getOperand(0);
5922   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
5923   assert(ShuffleVecVT.getVectorElementType() ==
5924          ExtractedFromVec.getSimpleValueType().getVectorElementType());
5925 
5926   int ShuffleIdx = SVOp->getMaskElt(Idx);
5927   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
5928     ExtractedFromVec = ShuffleVec;
5929     return ShuffleIdx;
5930   }
5931   return Idx;
5932 }
5933 
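     /// Try to lower a BUILD_VECTOR composed mostly of EXTRACT_VECTOR_ELTs from at
     /// most two source vectors as a vector shuffle, then insert the few remaining
     /// non-extracted elements individually.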
5934 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5935   MVT VT = Op.getSimpleValueType();
5936 
5937   // Skip if insert_vec_elt is not supported.
5938   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5939   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5940     return SDValue();
5941 
5942   SDLoc DL(Op);
5943   unsigned NumElems = Op.getNumOperands();
5944 
5945   SDValue VecIn1;
5946   SDValue VecIn2;
5947   SmallVector<unsigned, 4> InsertIndices;
5948   SmallVector<int, 8> Mask(NumElems, -1);
5949 
5950   for (unsigned i = 0; i != NumElems; ++i) {
5951     unsigned Opc = Op.getOperand(i).getOpcode();
5952 
5953     if (Opc == ISD::UNDEF)
5954       continue;
5955 
5956     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
5957       // Quit if more than 1 element needs inserting.
5958       if (InsertIndices.size() > 1)
5959         return SDValue();
5960 
5961       InsertIndices.push_back(i);
5962       continue;
5963     }
5964 
5965     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5966     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5967     // Quit if non-constant index.
5968     if (!isa<ConstantSDNode>(ExtIdx))
5969       return SDValue();
5970     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
5971 
5972     // Quit if extracted from vector of different type.
5973     if (ExtractedFromVec.getValueType() != VT)
5974       return SDValue();
5975 
5976     if (!VecIn1.getNode())
5977       VecIn1 = ExtractedFromVec;
5978     else if (VecIn1 != ExtractedFromVec) {
5979       if (!VecIn2.getNode())
5980         VecIn2 = ExtractedFromVec;
5981       else if (VecIn2 != ExtractedFromVec)
5982         // Quit if more than 2 vectors to shuffle
5983         return SDValue();
5984     }
5985 
5986     if (ExtractedFromVec == VecIn1)
5987       Mask[i] = Idx;
5988     else if (ExtractedFromVec == VecIn2)
5989       Mask[i] = Idx + NumElems;
5990   }
5991 
5992   if (!VecIn1.getNode())
5993     return SDValue();
5994 
5995   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5996   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
5997   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5998     unsigned Idx = InsertIndices[i];
5999     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6000                      DAG.getIntPtrConstant(Idx, DL));
6001   }
6002 
6003   return NV;
6004 }
6005 
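     /// Pack the constant i1 elements of a build_vector into the bits of a single
     /// integer constant (undef elements contribute 0).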
6006 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6007   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6008          Op.getScalarValueSizeInBits() == 1 &&
6009          "Can not convert non-constant vector");
6010   uint64_t Immediate = 0;
6011   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6012     SDValue In = Op.getOperand(idx);
6013     if (!In.isUndef())
6014       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6015   }
6016   SDLoc dl(Op);
6017   MVT VT =
6018    MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
6019   return DAG.getConstant(Immediate, dl, VT);
6020 }
6021 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6022 SDValue
6023 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6024 
6025   MVT VT = Op.getSimpleValueType();
6026   assert((VT.getVectorElementType() == MVT::i1) &&
6027          "Unexpected type in LowerBUILD_VECTORvXi1!");
6028 
6029   SDLoc dl(Op);
6030   if (ISD::isBuildVectorAllZeros(Op.getNode()))
6031     return DAG.getTargetConstant(0, dl, VT);
6032 
6033   if (ISD::isBuildVectorAllOnes(Op.getNode()))
6034     return DAG.getTargetConstant(1, dl, VT);
6035 
6036   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6037     SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6038     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6039       return DAG.getBitcast(VT, Imm);
6040     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6041     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6042                         DAG.getIntPtrConstant(0, dl));
6043   }
6044 
6045   // Vector has one or more non-const elements
6046   uint64_t Immediate = 0;
6047   SmallVector<unsigned, 16> NonConstIdx;
6048   bool IsSplat = true;
6049   bool HasConstElts = false;
6050   int SplatIdx = -1;
6051   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6052     SDValue In = Op.getOperand(idx);
6053     if (In.isUndef())
6054       continue;
6055     if (!isa<ConstantSDNode>(In))
6056       NonConstIdx.push_back(idx);
6057     else {
6058       Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6059       HasConstElts = true;
6060     }
6061     if (SplatIdx < 0)
6062       SplatIdx = idx;
6063     else if (In != Op.getOperand(SplatIdx))
6064       IsSplat = false;
6065   }
6066 
6067   // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
6068   if (IsSplat)
6069     return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6070                        DAG.getConstant(1, dl, VT),
6071                        DAG.getConstant(0, dl, VT));
6072 
6073   // insert elements one by one
6074   SDValue DstVec;
6075   SDValue Imm;
6076   if (Immediate) {
6077     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6078     Imm = DAG.getConstant(Immediate, dl, ImmVT);
6079   }
6080   else if (HasConstElts)
6081     Imm = DAG.getConstant(0, dl, VT);
6082   else
6083     Imm = DAG.getUNDEF(VT);
6084   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6085     DstVec = DAG.getBitcast(VT, Imm);
6086   else {
6087     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6088     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6089                          DAG.getIntPtrConstant(0, dl));
6090   }
6091 
6092   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6093     unsigned InsertIdx = NonConstIdx[i];
6094     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6095                          Op.getOperand(InsertIdx),
6096                          DAG.getIntPtrConstant(InsertIdx, dl));
6097   }
6098   return DstVec;
6099 }
6100 
6101 /// \brief Return true if \p N implements a horizontal binop, and set V0 and
6102 /// V1 to the operands of that binop.
6103 ///
6104 /// This is a helper function of LowerToHorizontalOp().
6105 /// This function checks whether the input build_vector \p N implements a
6106 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6107 /// operation to match.
6108 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6109 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6110 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6111 /// arithmetic sub.
6112 ///
6113 /// This function only analyzes elements of \p N whose indices are
6114 /// in range [BaseIdx, LastIdx).
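/// For example, with \p Opcode == ISD::ADD, BaseIdx == 0 and LastIdx == 4,
/// the v4i32 build_vector
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///   (add (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///   (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// matches, with V0 = A and V1 = B.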
6115 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6116                               SelectionDAG &DAG,
6117                               unsigned BaseIdx, unsigned LastIdx,
6118                               SDValue &V0, SDValue &V1) {
6119   EVT VT = N->getValueType(0);
6120 
6121   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6122   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6123          "Invalid Vector in input!");
6124 
6125   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6126   bool CanFold = true;
6127   unsigned ExpectedVExtractIdx = BaseIdx;
6128   unsigned NumElts = LastIdx - BaseIdx;
6129   V0 = DAG.getUNDEF(VT);
6130   V1 = DAG.getUNDEF(VT);
6131 
6132   // Check if N implements a horizontal binop.
6133   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6134     SDValue Op = N->getOperand(i + BaseIdx);
6135 
6136     // Skip UNDEFs.
6137     if (Op->isUndef()) {
6138       // Update the expected vector extract index.
6139       if (i * 2 == NumElts)
6140         ExpectedVExtractIdx = BaseIdx;
6141       ExpectedVExtractIdx += 2;
6142       continue;
6143     }
6144 
6145     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6146 
6147     if (!CanFold)
6148       break;
6149 
6150     SDValue Op0 = Op.getOperand(0);
6151     SDValue Op1 = Op.getOperand(1);
6152 
6153     // Try to match the following pattern:
6154     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6155     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6156         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6157         Op0.getOperand(0) == Op1.getOperand(0) &&
6158         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6159         isa<ConstantSDNode>(Op1.getOperand(1)));
6160     if (!CanFold)
6161       break;
6162 
6163     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6164     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6165 
6166     if (i * 2 < NumElts) {
6167       if (V0.isUndef()) {
6168         V0 = Op0.getOperand(0);
6169         if (V0.getValueType() != VT)
6170           return false;
6171       }
6172     } else {
6173       if (V1.isUndef()) {
6174         V1 = Op0.getOperand(0);
6175         if (V1.getValueType() != VT)
6176           return false;
6177       }
6178       if (i * 2 == NumElts)
6179         ExpectedVExtractIdx = BaseIdx;
6180     }
6181 
6182     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6183     if (I0 == ExpectedVExtractIdx)
6184       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6185     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6186       // Try to match the following dag sequence:
6187       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6188       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6189     } else
6190       CanFold = false;
6191 
6192     ExpectedVExtractIdx += 2;
6193   }
6194 
6195   return CanFold;
6196 }
6197 
6198 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6199 /// a concat_vector.
6200 ///
6201 /// This is a helper function of LowerToHorizontalOp().
6202 /// This function expects two 256-bit vectors called V0 and V1.
6203 /// At first, each vector is split into two separate 128-bit vectors.
6204 /// Then, the resulting 128-bit vectors are used to implement two
6205 /// horizontal binary operations.
6206 ///
6207 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6208 ///
6209 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to the
6210 /// two new horizontal binops.
6211 /// When Mode is set, the first horizontal binop dag node takes as input the
6212 /// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
6213 /// binop dag node takes as input the lower 128 bits of V1 and the upper
6214 /// 128 bits of V1.
6215 ///   Example:
6216 ///     HADD V0_LO, V0_HI
6217 ///     HADD V1_LO, V1_HI
6218 ///
6219 /// Otherwise, the first horizontal binop dag node takes as input the lower
6220 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
6221 /// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
6222 ///   Example:
6223 ///     HADD V0_LO, V1_LO
6224 ///     HADD V0_HI, V1_HI
6225 ///
6226 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6227 /// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6228 /// the upper 128 bits of the result.
6229 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6230                                      const SDLoc &DL, SelectionDAG &DAG,
6231                                      unsigned X86Opcode, bool Mode,
6232                                      bool isUndefLO, bool isUndefHI) {
6233   MVT VT = V0.getSimpleValueType();
6234   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6235          "Invalid nodes in input!");
6236 
6237   unsigned NumElts = VT.getVectorNumElements();
6238   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6239   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6240   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6241   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6242   MVT NewVT = V0_LO.getSimpleValueType();
6243 
6244   SDValue LO = DAG.getUNDEF(NewVT);
6245   SDValue HI = DAG.getUNDEF(NewVT);
6246 
6247   if (Mode) {
6248     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6249     if (!isUndefLO && !V0->isUndef())
6250       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6251     if (!isUndefHI && !V1->isUndef())
6252       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6253   } else {
6254     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6255     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6256       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6257 
6258     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6259       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6260   }
6261 
6262   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6263 }
6264 
6265 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
6266 /// node.
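/// For example, the v4f32 build_vector
///   (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
///   (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
///   (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
///   (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3))
/// becomes (X86ISD::ADDSUB A, B), matching the addsubps semantics of
/// subtracting in the even lanes and adding in the odd lanes.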
6267 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
6268                              const X86Subtarget &Subtarget, SelectionDAG &DAG) {
6269   MVT VT = BV->getSimpleValueType(0);
6270   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6271       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
6272     return SDValue();
6273 
6274   SDLoc DL(BV);
6275   unsigned NumElts = VT.getVectorNumElements();
6276   SDValue InVec0 = DAG.getUNDEF(VT);
6277   SDValue InVec1 = DAG.getUNDEF(VT);
6278 
6279   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6280           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6281 
6282   // Odd-numbered elements in the input build vector are obtained from
6283   // adding two integer/float elements.
6284   // Even-numbered elements in the input build vector are obtained from
6285   // subtracting two integer/float elements.
6286   unsigned ExpectedOpcode = ISD::FSUB;
6287   unsigned NextExpectedOpcode = ISD::FADD;
6288   bool AddFound = false;
6289   bool SubFound = false;
6290 
6291   for (unsigned i = 0, e = NumElts; i != e; ++i) {
6292     SDValue Op = BV->getOperand(i);
6293 
6294     // Skip 'undef' values.
6295     unsigned Opcode = Op.getOpcode();
6296     if (Opcode == ISD::UNDEF) {
6297       std::swap(ExpectedOpcode, NextExpectedOpcode);
6298       continue;
6299     }
6300 
6301     // Early exit if we found an unexpected opcode.
6302     if (Opcode != ExpectedOpcode)
6303       return SDValue();
6304 
6305     SDValue Op0 = Op.getOperand(0);
6306     SDValue Op1 = Op.getOperand(1);
6307 
6308     // Try to match the following pattern:
6309     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6310     // Early exit if we cannot match that sequence.
6311     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6312         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6313         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6314         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6315         Op0.getOperand(1) != Op1.getOperand(1))
6316       return SDValue();
6317 
6318     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6319     if (I0 != i)
6320       return SDValue();
6321 
6322     // We found a valid add/sub node. Update the information accordingly.
6323     if (i & 1)
6324       AddFound = true;
6325     else
6326       SubFound = true;
6327 
6328     // Update InVec0 and InVec1.
6329     if (InVec0.isUndef()) {
6330       InVec0 = Op0.getOperand(0);
6331       if (InVec0.getSimpleValueType() != VT)
6332         return SDValue();
6333     }
6334     if (InVec1.isUndef()) {
6335       InVec1 = Op1.getOperand(0);
6336       if (InVec1.getSimpleValueType() != VT)
6337         return SDValue();
6338     }
6339 
6340     // Make sure that the operands of each add/sub node always
6341     // come from the same pair of vectors.
6342     if (InVec0 != Op0.getOperand(0)) {
6343       if (ExpectedOpcode == ISD::FSUB)
6344         return SDValue();
6345 
6346       // FADD is commutable. Try to commute the operands
6347       // and then test again.
6348       std::swap(Op0, Op1);
6349       if (InVec0 != Op0.getOperand(0))
6350         return SDValue();
6351     }
6352 
6353     if (InVec1 != Op1.getOperand(0))
6354       return SDValue();
6355 
6356     // Update the pair of expected opcodes.
6357     std::swap(ExpectedOpcode, NextExpectedOpcode);
6358   }
6359 
6360   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6361   if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
6362     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6363 
6364   return SDValue();
6365 }
6366 
6367 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
6368 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
6369                                    const X86Subtarget &Subtarget,
6370                                    SelectionDAG &DAG) {
6371   MVT VT = BV->getSimpleValueType(0);
6372   unsigned NumElts = VT.getVectorNumElements();
6373   unsigned NumUndefsLO = 0;
6374   unsigned NumUndefsHI = 0;
6375   unsigned Half = NumElts/2;
6376 
6377   // Count the number of UNDEF operands in the input build_vector.
6378   for (unsigned i = 0, e = Half; i != e; ++i)
6379     if (BV->getOperand(i)->isUndef())
6380       NumUndefsLO++;
6381 
6382   for (unsigned i = Half, e = NumElts; i != e; ++i)
6383     if (BV->getOperand(i)->isUndef())
6384       NumUndefsHI++;
6385 
6386   // Early exit if this is either a build_vector of all UNDEFs, or if all
6387   // the operands but one are UNDEF.
6388   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6389     return SDValue();
6390 
6391   SDLoc DL(BV);
6392   SDValue InVec0, InVec1;
6393   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
6394     // Try to match an SSE3 float HADD/HSUB.
6395     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6396       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6397 
6398     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6399       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6400   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
6401     // Try to match an SSSE3 integer HADD/HSUB.
6402     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6403       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6404 
6405     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6406       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6407   }
6408 
6409   if (!Subtarget.hasAVX())
6410     return SDValue();
6411 
6412   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6413     // Try to match an AVX horizontal add/sub of packed single/double
6414     // precision floating point values from 256-bit vectors.
6415     SDValue InVec2, InVec3;
6416     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6417         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6418         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6419         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6420       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6421 
6422     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6423         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6424         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6425         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6426       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6427   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6428     // Try to match an AVX2 horizontal add/sub of signed integers.
6429     SDValue InVec2, InVec3;
6430     unsigned X86Opcode;
6431     bool CanFold = true;
6432 
6433     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6434         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6435         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6436         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6437       X86Opcode = X86ISD::HADD;
6438     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6439         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6440         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6441         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6442       X86Opcode = X86ISD::HSUB;
6443     else
6444       CanFold = false;
6445 
6446     if (CanFold) {
6447       // Fold this build_vector into a single horizontal add/sub.
6448       // Do this only if the target has AVX2.
6449       if (Subtarget.hasAVX2())
6450         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6451 
6452       // Do not try to expand this build_vector into a pair of horizontal
6453       // add/sub if we can emit a pair of scalar add/sub.
6454       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6455         return SDValue();
6456 
6457       // Convert this build_vector into a pair of horizontal binop followed by
6458       // a concat vector.
6459       bool isUndefLO = NumUndefsLO == Half;
6460       bool isUndefHI = NumUndefsHI == Half;
6461       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6462                                    isUndefLO, isUndefHI);
6463     }
6464   }
6465 
6466   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6467        VT == MVT::v16i16) && Subtarget.hasAVX()) {
6468     unsigned X86Opcode;
6469     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6470       X86Opcode = X86ISD::HADD;
6471     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6472       X86Opcode = X86ISD::HSUB;
6473     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6474       X86Opcode = X86ISD::FHADD;
6475     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6476       X86Opcode = X86ISD::FHSUB;
6477     else
6478       return SDValue();
6479 
6480     // Don't try to expand this build_vector into a pair of horizontal add/sub
6481     // if we can simply emit a pair of scalar add/sub.
6482     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6483       return SDValue();
6484 
6485     // Convert this build_vector into two horizontal add/sub followed by
6486     // a concat vector.
6487     bool isUndefLO = NumUndefsLO == Half;
6488     bool isUndefHI = NumUndefsHI == Half;
6489     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6490                                  isUndefLO, isUndefHI);
6491   }
6492 
6493   return SDValue();
6494 }
6495 
6496 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
6497 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
6498 /// apply the bit operation to the vectors.
6499 /// NOTE: It's not in our interest to start building a general-purpose
6500 /// vectorizer from this, but enough scalar bit operations are created by the
6501 /// later legalization + scalarization stages to need basic support.
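/// For example, the v4i32 build_vector
///   (and a, 1), (and b, 2), (and c, 4), (and d, 8)
/// becomes
///   (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)).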
6502 static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
6503   SDLoc DL(Op);
6504   MVT VT = Op.getSimpleValueType();
6505   unsigned NumElems = VT.getVectorNumElements();
6506   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6507 
6508   // Check that all elements have the same opcode.
6509   // TODO: Should we allow UNDEFS and if so how many?
6510   unsigned Opcode = Op.getOperand(0).getOpcode();
6511   for (unsigned i = 1; i < NumElems; ++i)
6512     if (Opcode != Op.getOperand(i).getOpcode())
6513       return SDValue();
6514 
6515   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
6516   switch (Opcode) {
6517   default:
6518     return SDValue();
6519   case ISD::AND:
6520   case ISD::XOR:
6521   case ISD::OR:
6522     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
6523       return SDValue();
6524     break;
6525   }
6526 
6527   SmallVector<SDValue, 4> LHSElts, RHSElts;
6528   for (SDValue Elt : Op->ops()) {
6529     SDValue LHS = Elt.getOperand(0);
6530     SDValue RHS = Elt.getOperand(1);
6531 
6532     // We expect the canonicalized RHS operand to be the constant.
6533     if (!isa<ConstantSDNode>(RHS))
6534       return SDValue();
6535     LHSElts.push_back(LHS);
6536     RHSElts.push_back(RHS);
6537   }
6538 
6539   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
6540   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
6541   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
6542 }
6543 
6544 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
6545 /// functionality to do this, so it's all zeros, all ones, or some derivation
6546 /// that is cheap to calculate.
6547 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
6548                                          const X86Subtarget &Subtarget) {
6549   SDLoc DL(Op);
6550   MVT VT = Op.getSimpleValueType();
6551 
6552   // Vectors containing all zeros can be matched by pxor and xorps.
6553   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6554     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6555     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6556     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6557       return Op;
6558 
6559     return getZeroVector(VT, Subtarget, DAG, DL);
6560   }
6561 
6562   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6563   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6564   // vpcmpeqd on 256-bit vectors.
6565   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6566     if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
6567         (VT == MVT::v8i32 && Subtarget.hasInt256()))
6568       return Op;
6569 
6570     return getOnesVector(VT, Subtarget, DAG, DL);
6571   }
6572 
6573   return SDValue();
6574 }
6575 
6576 SDValue
6577 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6578   SDLoc dl(Op);
6579 
6580   MVT VT = Op.getSimpleValueType();
6581   MVT ExtVT = VT.getVectorElementType();
6582   unsigned NumElems = Op.getNumOperands();
6583 
6584   // Generate vectors for predicate vectors.
6585   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
6586     return LowerBUILD_VECTORvXi1(Op, DAG);
6587 
6588   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
6589     return VectorConstant;
6590 
6591   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
6592   if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
6593     return AddSub;
6594   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
6595     return HorizontalOp;
6596   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
6597     return Broadcast;
6598   if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
6599     return BitOp;
6600 
6601   unsigned EVTBits = ExtVT.getSizeInBits();
6602 
6603   unsigned NumZero  = 0;
6604   unsigned NumNonZero = 0;
6605   uint64_t NonZeros = 0;
6606   bool IsAllConstants = true;
6607   SmallSet<SDValue, 8> Values;
6608   for (unsigned i = 0; i < NumElems; ++i) {
6609     SDValue Elt = Op.getOperand(i);
6610     if (Elt.isUndef())
6611       continue;
6612     Values.insert(Elt);
6613     if (Elt.getOpcode() != ISD::Constant &&
6614         Elt.getOpcode() != ISD::ConstantFP)
6615       IsAllConstants = false;
6616     if (X86::isZeroNode(Elt))
6617       NumZero++;
6618     else {
6619       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
6620       NonZeros |= ((uint64_t)1 << i);
6621       NumNonZero++;
6622     }
6623   }
6624 
6625   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6626   if (NumNonZero == 0)
6627     return DAG.getUNDEF(VT);
6628 
6629   // Special case for single non-zero, non-undef, element.
6630   if (NumNonZero == 1) {
6631     unsigned Idx = countTrailingZeros(NonZeros);
6632     SDValue Item = Op.getOperand(Idx);
6633 
6634     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6635     // the value are obviously zero, truncate the value to i32 and do the
6636     // insertion that way.  Only do this if the value is non-constant or if the
6637     // value is a constant being inserted into element 0.  It is cheaper to do
6638     // a constant pool load than it is to do a movd + shuffle.
6639     if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
6640         (!IsAllConstants || Idx == 0)) {
6641       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6642         // Handle SSE only.
6643         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6644         MVT VecVT = MVT::v4i32;
6645 
6646         // Truncate the value (which may itself be a constant) to i32, and
6647         // convert it to a vector with movd (S2V+shuffle to zero extend).
6648         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6649         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6650         return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
6651                                       Item, Idx * 2, true, Subtarget, DAG));
6652       }
6653     }
6654 
6655     // If we have a constant or non-constant insertion into the low element of
6656     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6657     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6658     // depending on what the source datatype is.
6659     if (Idx == 0) {
6660       if (NumZero == 0)
6661         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6662 
6663       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6664           (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
6665         if (VT.is512BitVector()) {
6666           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6667           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6668                              Item, DAG.getIntPtrConstant(0, dl));
6669         }
6670         assert((VT.is128BitVector() || VT.is256BitVector()) &&
6671                "Expected an SSE value type!");
6672         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6673         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6674         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6675       }
6676 
6677       // We can't directly insert an i8 or i16 into a vector, so zero extend
6678       // it to i32 first.
6679       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6680         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
6681         if (VT.getSizeInBits() >= 256) {
6682           MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
6683           if (Subtarget.hasAVX()) {
6684             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
6685             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6686           } else {
6687             // Without AVX, we need to extend to a 128-bit vector and then
6688             // insert into the 256-bit vector.
6689             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6690             SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
6691             Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
6692           }
6693         } else {
6694           assert(VT.is128BitVector() && "Expected an SSE value type!");
6695           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6696           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6697         }
6698         return DAG.getBitcast(VT, Item);
6699       }
6700     }
6701 
6702     // Is it a vector logical left shift?
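    // E.g. the v2i64 build_vector <0, x> can be lowered as (scalar_to_vector x)
    // shifted left by 64 bits with a whole-vector byte shift, which moves x
    // into the high element.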
6703     if (NumElems == 2 && Idx == 1 &&
6704         X86::isZeroNode(Op.getOperand(0)) &&
6705         !X86::isZeroNode(Op.getOperand(1))) {
6706       unsigned NumBits = VT.getSizeInBits();
6707       return getVShift(true, VT,
6708                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6709                                    VT, Op.getOperand(1)),
6710                        NumBits/2, DAG, *this, dl);
6711     }
6712 
6713     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
6714       return SDValue();
6715 
6716     // Otherwise, if this is a vector with i32 or f32 elements, and the element
6717     // is a non-constant being inserted into an element other than the low one,
6718     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
6719     // movd/movss) to move this into the low element, then shuffle it into
6720     // place.
6721     if (EVTBits == 32) {
6722       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6723       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
6724     }
6725   }
6726 
6727   // Splat is obviously ok. Let legalizer expand it to a shuffle.
6728   if (Values.size() == 1) {
6729     if (EVTBits == 32) {
6730       // Instead of a shuffle like this:
6731       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
6732       // Check if it's possible to issue this instead.
6733       // shuffle (vload ptr), undef, <1, 1, 1, 1>
6734       unsigned Idx = countTrailingZeros(NonZeros);
6735       SDValue Item = Op.getOperand(Idx);
6736       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
6737         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
6738     }
6739     return SDValue();
6740   }
6741 
6742   // A vector full of immediates; various special cases are already
6743   // handled, so this is best done with a single constant-pool load.
6744   if (IsAllConstants)
6745     return SDValue();
6746 
6747   // See if we can use a vector load to get all of the elements.
6748   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
6749     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6750     if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
6751       return LD;
6752   }
6753 
6754   // For AVX-length vectors, build the individual 128-bit pieces and use
6755   // shuffles to put them in place.
6756   if (VT.is256BitVector() || VT.is512BitVector()) {
6757     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6758 
6759     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
6760 
6761     // Build both the lower and upper subvector.
6762     SDValue Lower =
6763         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
6764     SDValue Upper = DAG.getBuildVector(
6765         HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
6766 
6767     // Recreate the wider vector with the lower and upper part.
6768     if (VT.is256BitVector())
6769       return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6770     return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6771   }
6772 
6773   // Let legalizer expand 2-wide build_vectors.
6774   if (EVTBits == 64) {
6775     if (NumNonZero == 1) {
6776       // One half is zero or undef.
6777       unsigned Idx = countTrailingZeros(NonZeros);
6778       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
6779                                Op.getOperand(Idx));
6780       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
6781     }
6782     return SDValue();
6783   }
6784 
6785   // If element VT is < 32 bits, convert it to inserts into a zero vector.
6786   if (EVTBits == 8 && NumElems == 16)
6787     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
6788                                           DAG, Subtarget, *this))
6789       return V;
6790 
6791   if (EVTBits == 16 && NumElems == 8)
6792     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
6793                                           DAG, Subtarget, *this))
6794       return V;
6795 
6796   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
6797   if (EVTBits == 32 && NumElems == 4)
6798     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
6799       return V;
6800 
6801   // If element VT is == 32 bits, turn it into a number of shuffles.
6802   if (NumElems == 4 && NumZero > 0) {
6803     SmallVector<SDValue, 8> Ops(NumElems);
6804     for (unsigned i = 0; i < 4; ++i) {
6805       bool isZero = !(NonZeros & (1ULL << i));
6806       if (isZero)
6807         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
6808       else
6809         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6810     }
6811 
6812     for (unsigned i = 0; i < 2; ++i) {
6813       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
6814         default: break;
6815         case 0:
6816           Ops[i] = Ops[i*2];  // Must be a zero vector.
6817           break;
6818         case 1:
6819           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
6820           break;
6821         case 2:
6822           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6823           break;
6824         case 3:
6825           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6826           break;
6827       }
6828     }
6829 
6830     bool Reverse1 = (NonZeros & 0x3) == 2;
6831     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
6832     int MaskVec[] = {
6833       Reverse1 ? 1 : 0,
6834       Reverse1 ? 0 : 1,
6835       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6836       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
6837     };
6838     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
6839   }
6840 
6841   if (Values.size() > 1 && VT.is128BitVector()) {
6842     // Check for a build vector from mostly shuffle plus few inserting.
6843     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
6844       return Sh;
6845 
6846     // For SSE 4.1, use insertps to put the high elements into the low element.
6847     if (Subtarget.hasSSE41()) {
6848       SDValue Result;
6849       if (!Op.getOperand(0).isUndef())
6850         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6851       else
6852         Result = DAG.getUNDEF(VT);
6853 
6854       for (unsigned i = 1; i < NumElems; ++i) {
6855         if (Op.getOperand(i).isUndef()) continue;
6856         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6857                              Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6858       }
6859       return Result;
6860     }
6861 
6862     // Otherwise, expand into a number of unpckl*, start by extending each of
6863     // our (non-undef) elements to the full vector width with the element in the
6864     // bottom slot of the vector (which generates no code for SSE).
6865     SmallVector<SDValue, 8> Ops(NumElems);
6866     for (unsigned i = 0; i < NumElems; ++i) {
6867       if (!Op.getOperand(i).isUndef())
6868         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6869       else
6870         Ops[i] = DAG.getUNDEF(VT);
6871     }
6872 
6873     // Next, we iteratively mix elements, e.g. for v4f32:
6874     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6875     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6876     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
6877     unsigned EltStride = NumElems >> 1;
6878     while (EltStride != 0) {
6879       for (unsigned i = 0; i < EltStride; ++i) {
6880         // If Ops[i+EltStride] is undef and this is the first round of mixing,
6881         // then it is safe to just drop this shuffle: V[i] is already in the
6882         // right place, the one element (since it's the first round) being
6883         // inserted as undef can be dropped.  This isn't safe for successive
6884         // rounds because they will permute elements within both vectors.
6885         if (Ops[i+EltStride].isUndef() &&
6886             EltStride == NumElems/2)
6887           continue;
6888 
6889         Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
6890       }
6891       EltStride >>= 1;
6892     }
6893     return Ops[0];
6894   }
6895   return SDValue();
6896 }
6897 
6898 // 256-bit AVX can use the vinsertf128 instruction
6899 // to create 256-bit vectors from two other 128-bit ones.
6900 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6901   SDLoc dl(Op);
6902   MVT ResVT = Op.getSimpleValueType();
6903 
6904   assert((ResVT.is256BitVector() ||
6905           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6906 
6907   SDValue V1 = Op.getOperand(0);
6908   SDValue V2 = Op.getOperand(1);
6909   unsigned NumElems = ResVT.getVectorNumElements();
6910   if (ResVT.is256BitVector())
6911     return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6912 
6913   if (Op.getNumOperands() == 4) {
6914     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6915                                   ResVT.getVectorNumElements()/2);
6916     SDValue V3 = Op.getOperand(2);
6917     SDValue V4 = Op.getOperand(3);
6918     return concat256BitVectors(
6919         concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
6920         concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
6921         NumElems, DAG, dl);
6922   }
6923   return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6924 }
6925 
6926 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
6927                                        const X86Subtarget &Subtarget,
6928                                        SelectionDAG & DAG) {
6929   SDLoc dl(Op);
6930   MVT ResVT = Op.getSimpleValueType();
6931   unsigned NumOfOperands = Op.getNumOperands();
6932 
6933   assert(isPowerOf2_32(NumOfOperands) &&
6934          "Unexpected number of operands in CONCAT_VECTORS");
6935 
6936   SDValue Undef = DAG.getUNDEF(ResVT);
6937   if (NumOfOperands > 2) {
6938     // Specialize the cases when all, or all but one, of the operands are undef.
6939     unsigned NumOfDefinedOps = 0;
6940     unsigned OpIdx = 0;
6941     for (unsigned i = 0; i < NumOfOperands; i++)
6942       if (!Op.getOperand(i).isUndef()) {
6943         NumOfDefinedOps++;
6944         OpIdx = i;
6945       }
6946     if (NumOfDefinedOps == 0)
6947       return Undef;
6948     if (NumOfDefinedOps == 1) {
6949       unsigned SubVecNumElts =
6950         Op.getOperand(OpIdx).getValueType().getVectorNumElements();
6951       SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
6952       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
6953                          Op.getOperand(OpIdx), IdxVal);
6954     }
6955 
6956     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6957                                   ResVT.getVectorNumElements()/2);
6958     SmallVector<SDValue, 2> Ops;
6959     for (unsigned i = 0; i < NumOfOperands/2; i++)
6960       Ops.push_back(Op.getOperand(i));
6961     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6962     Ops.clear();
6963     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
6964       Ops.push_back(Op.getOperand(i));
6965     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6966     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
6967   }
6968 
6969   // 2 operands
6970   SDValue V1 = Op.getOperand(0);
6971   SDValue V2 = Op.getOperand(1);
6972   unsigned NumElems = ResVT.getVectorNumElements();
6973   assert(V1.getValueType() == V2.getValueType() &&
6974          V1.getValueType().getVectorNumElements() == NumElems/2 &&
6975          "Unexpected operands in CONCAT_VECTORS");
6976 
6977   if (ResVT.getSizeInBits() >= 16)
6978     return Op; // The operation is legal with KUNPCK
6979 
6980   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
6981   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
6982   SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
6983   if (IsZeroV1 && IsZeroV2)
6984     return ZeroVec;
6985 
6986   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6987   if (V2.isUndef())
6988     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
6989   if (IsZeroV2)
6990     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
6991 
6992   SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
6993   if (V1.isUndef())
6994     V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
6995 
6996   if (IsZeroV1)
6997     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
6998 
6999   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7000   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7001 }
7002 
7003 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7004                                    const X86Subtarget &Subtarget,
7005                                    SelectionDAG &DAG) {
7006   MVT VT = Op.getSimpleValueType();
7007   if (VT.getVectorElementType() == MVT::i1)
7008     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7009 
7010   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7011          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7012           Op.getNumOperands() == 4)));
7013 
7014   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7015   // from two other 128-bit ones.
7016 
7017   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7018   return LowerAVXCONCAT_VECTORS(Op, DAG);
7019 }
7020 
7021 //===----------------------------------------------------------------------===//
7022 // Vector shuffle lowering
7023 //
7024 // This is an experimental code path for lowering vector shuffles on x86. It is
7025 // designed to handle arbitrary vector shuffles and blends, gracefully
7026 // degrading performance as necessary. It works hard to recognize idiomatic
7027 // shuffles and lower them to optimal instruction patterns without leaving
7028 // a framework that allows reasonably efficient handling of all vector shuffle
7029 // patterns.
7030 //===----------------------------------------------------------------------===//
7031 
7032 /// \brief Tiny helper function to identify a no-op mask.
7033 ///
7034 /// This is a somewhat boring predicate function. It checks whether the mask
7035 /// array input, which is assumed to be a single-input shuffle mask of the kind
7036 /// used by the X86 shuffle instructions (not a fully general
7037 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
7038 /// in-place shuffle are 'no-op's.
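/// For example, {0, -1, 2, 3} is a no-op mask, while {1, 0, 2, 3} is not.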
7039 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7040   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7041     assert(Mask[i] >= -1 && "Out of bound mask element!");
7042     if (Mask[i] >= 0 && Mask[i] != i)
7043       return false;
7044   }
7045   return true;
7046 }
7047 
7048 /// \brief Test whether there are elements crossing 128-bit lanes in this
7049 /// shuffle mask.
7050 ///
7051 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7052 /// and we routinely test for these.
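/// For example, for v8f32 the mask {1, 0, 3, 2, 5, 4, 7, 6} stays within its
/// 128-bit lanes, while {4, 5, 6, 7, 0, 1, 2, 3} crosses them.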
7053 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7054   int LaneSize = 128 / VT.getScalarSizeInBits();
7055   int Size = Mask.size();
7056   for (int i = 0; i < Size; ++i)
7057     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7058       return true;
7059   return false;
7060 }
7061 
7062 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7063 ///
7064 /// This checks a shuffle mask to see if it is performing the same
7065 /// lane-relative shuffle in each sub-lane. This trivially implies
7066 /// that it is also not lane-crossing. It may however involve a blend from the
7067 /// same lane of a second vector.
7068 ///
7069 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7070 /// non-trivial to compute in the face of undef lanes. The representation is
7071 /// suitable for use with existing 128-bit shuffles as entries from the second
7072 /// vector have been remapped to [LaneSize, 2*LaneSize).
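/// For example, for v8f32 with 128-bit lanes the mask
/// {0, 8, 2, 10, 4, 12, 6, 14} is repeated, with RepeatedMask = {0, 4, 2, 6}:
/// the second vector's elements are remapped into [4, 8).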
7073 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7074                                   ArrayRef<int> Mask,
7075                                   SmallVectorImpl<int> &RepeatedMask) {
7076   int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7077   RepeatedMask.assign(LaneSize, -1);
7078   int Size = Mask.size();
7079   for (int i = 0; i < Size; ++i) {
7080     if (Mask[i] < 0)
7081       continue;
7082     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7083       // This entry crosses lanes, so there is no way to model this shuffle.
7084       return false;
7085 
7086     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7087     // Adjust second vector indices to start at LaneSize instead of Size.
7088     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7089                                 : Mask[i] % LaneSize + LaneSize;
7090     if (RepeatedMask[i % LaneSize] < 0)
7091       // This is the first non-undef entry in this slot of a 128-bit lane.
7092       RepeatedMask[i % LaneSize] = LocalM;
7093     else if (RepeatedMask[i % LaneSize] != LocalM)
7094       // Found a mismatch with the repeated mask.
7095       return false;
7096   }
7097   return true;
7098 }
7099 
7100 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
7101 static bool
7102 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7103                                 SmallVectorImpl<int> &RepeatedMask) {
7104   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7105 }
7106 
7107 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
7108 static bool
7109 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7110                                 SmallVectorImpl<int> &RepeatedMask) {
7111   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
7112 }
7113 
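/// Scale a shuffle mask by the given factor: each mask element M expands into
/// the run of \p Scale consecutive elements starting at Scale*M. For example,
/// with Scale == 2 the mask {0, -1, 3, 1} becomes {0, 1, -1, -1, 6, 7, 2, 3};
/// sentinel (negative) values are simply repeated.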
7114 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
7115                              SmallVectorImpl<int> &ScaledMask) {
7116   assert(0 < Scale && "Unexpected scaling factor");
7117   int NumElts = Mask.size();
7118   ScaledMask.assign(NumElts * Scale, -1);
7119 
7120   for (int i = 0; i != NumElts; ++i) {
7121     int M = Mask[i];
7122 
7123     // Repeat sentinel values in every mask element.
7124     if (M < 0) {
7125       for (int s = 0; s != Scale; ++s)
7126         ScaledMask[(Scale * i) + s] = M;
7127       continue;
7128     }
7129 
7130     // Scale mask element and increment across each mask element.
7131     for (int s = 0; s != Scale; ++s)
7132       ScaledMask[(Scale * i) + s] = (Scale * M) + s;
7133   }
7134 }
7135 
7136 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7137 /// arguments.
7138 ///
7139 /// This is a fast way to test a shuffle mask against a fixed pattern:
7140 ///
7141 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
7142 ///
7143 /// It returns true if the mask is exactly as wide as the argument list, and
7144 /// each element of the mask is either -1 (signifying undef) or the value given
7145 /// in the argument.
7146 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7147                                 ArrayRef<int> ExpectedMask) {
7148   if (Mask.size() != ExpectedMask.size())
7149     return false;
7150 
7151   int Size = Mask.size();
7152 
7153   // If the values are build vectors, we can look through them to find
7154   // equivalent inputs that make the shuffles equivalent.
7155   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7156   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7157 
7158   for (int i = 0; i < Size; ++i) {
7159     assert(Mask[i] >= -1 && "Out of bound mask element!");
7160     if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7161       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7162       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7163       if (!MaskBV || !ExpectedBV ||
7164           MaskBV->getOperand(Mask[i] % Size) !=
7165               ExpectedBV->getOperand(ExpectedMask[i] % Size))
7166         return false;
7167     }
7168   }
7169 
7170   return true;
7171 }
7172 
7173 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7174 ///
7175 /// The masks must be exactly the same width.
7176 ///
7177 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7178 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7179 ///
7180 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
7181 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7182                                       ArrayRef<int> ExpectedMask) {
7183   int Size = Mask.size();
7184   if (Size != (int)ExpectedMask.size())
7185     return false;
7186 
7187   for (int i = 0; i < Size; ++i)
7188     if (Mask[i] == SM_SentinelUndef)
7189       continue;
7190     else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7191       return false;
7192     else if (Mask[i] != ExpectedMask[i])
7193       return false;
7194 
7195   return true;
7196 }
7197 
7198 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7199 ///
7200 /// This helper function produces an 8-bit shuffle immediate corresponding to
7201 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7202 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7203 /// example.
7204 ///
7205 /// NB: We rely heavily on "undef" masks preserving the input lane.
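/// For example, the mask {3, 1, 2, 0} is encoded as 0b00100111 (0x27): two
/// bits per result lane, with lane 0 in the lowest two bits. Undef mask
/// elements fall back to their identity index, preserving the input lane.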
7206 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7207   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7208   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7209   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7210   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7211   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7212 
7213   unsigned Imm = 0;
7214   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
7215   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
7216   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
7217   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
7218   return Imm;
7219 }
7220 
7221 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
7222                                           SelectionDAG &DAG) {
7223   return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
7224 }
7225 
7226 /// \brief Compute whether each element of a shuffle is zeroable.
7227 ///
7228 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7229 /// Either it is an undef element in the shuffle mask, the element of the input
7230 /// referenced is undef, or the element of the input referenced is known to be
7231 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7232 /// as many lanes with this technique as possible to simplify the remaining
7233 /// shuffle.
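/// For example, shuffling V1 = build_vector(x, 0, y, 0) with an all-zeros V2
/// under the mask {0, 1, 4, 7} gives Zeroable = {0, 1, 1, 1}: every element
/// taken from V2 is zero, and element 1 of V1 is a known zero constant.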
7234 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7235                                                      SDValue V1, SDValue V2) {
7236   SmallBitVector Zeroable(Mask.size(), false);
7237   V1 = peekThroughBitcasts(V1);
7238   V2 = peekThroughBitcasts(V2);
7239 
7240   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7241   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7242 
7243   int VectorSizeInBits = V1.getValueType().getSizeInBits();
7244   int ScalarSizeInBits = VectorSizeInBits / Mask.size();
7245   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7246 
7247   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7248     int M = Mask[i];
7249     // Handle the easy cases.
7250     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7251       Zeroable[i] = true;
7252       continue;
7253     }
7254 
7255     // Determine shuffle input and normalize the mask.
7256     SDValue V = M < Size ? V1 : V2;
7257     M %= Size;
7258 
7259     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7260     if (V.getOpcode() != ISD::BUILD_VECTOR)
7261       continue;
7262 
7263     // If the BUILD_VECTOR has fewer elements than the mask, then the
7264     // referenced portion of the (larger) source element must be UNDEF/ZERO.
7265     if ((Size % V.getNumOperands()) == 0) {
7266       int Scale = Size / V->getNumOperands();
7267       SDValue Op = V.getOperand(M / Scale);
7268       if (Op.isUndef() || X86::isZeroNode(Op))
7269         Zeroable[i] = true;
7270       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7271         APInt Val = Cst->getAPIntValue();
7272         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7273         Val = Val.getLoBits(ScalarSizeInBits);
7274         Zeroable[i] = (Val == 0);
7275       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7276         APInt Val = Cst->getValueAPF().bitcastToAPInt();
7277         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7278         Val = Val.getLoBits(ScalarSizeInBits);
7279         Zeroable[i] = (Val == 0);
7280       }
7281       continue;
7282     }
7283 
7284     // If the BUILD_VECTOR has more elements than the mask, then all of the
7285     // (smaller) source elements covered by the mask element must be UNDEF/ZERO.
7286     if ((V.getNumOperands() % Size) == 0) {
7287       int Scale = V->getNumOperands() / Size;
7288       bool AllZeroable = true;
7289       for (int j = 0; j < Scale; ++j) {
7290         SDValue Op = V.getOperand((M * Scale) + j);
7291         AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
7292       }
7293       Zeroable[i] = AllZeroable;
7294       continue;
7295     }
7296   }
7297 
7298   return Zeroable;
7299 }
7300 
7301 /// Try to lower a shuffle with a single PSHUFB of V1.
7302 /// This is only possible if V2 is unused (at all, or only for zero elements).
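/// For example (a sketch of one case, assuming SSSE3): a v8i16 shuffle of V1
/// with mask [3, 2, 1, 0, 7, 6, 5, 4] becomes a single PSHUFB whose byte mask
/// is [6,7, 4,5, 2,3, 0,1, 14,15, 12,13, 10,11, 8,9]; any zeroable element
/// instead gets the 0x80 (sign bit set) marker used below.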
7303 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
7304                                             ArrayRef<int> Mask, SDValue V1,
7305                                             SDValue V2,
7306                                             const X86Subtarget &Subtarget,
7307                                             SelectionDAG &DAG) {
7308   int Size = Mask.size();
7309   int LaneSize = 128 / VT.getScalarSizeInBits();
7310   const int NumBytes = VT.getSizeInBits() / 8;
7311   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
7312 
7313   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
7314          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
7315          (Subtarget.hasBWI() && VT.is512BitVector()));
7316 
7317   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7318 
7319   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
7320   // Sign bit set in i8 mask means zero element.
7321   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
7322 
7323   for (int i = 0; i < NumBytes; ++i) {
7324     int M = Mask[i / NumEltBytes];
7325     if (M < 0) {
7326       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
7327       continue;
7328     }
7329     if (Zeroable[i / NumEltBytes]) {
7330       PSHUFBMask[i] = ZeroMask;
7331       continue;
7332     }
7333     // Only allow V1.
7334     if (M >= Size)
7335       return SDValue();
7336 
7337     // PSHUFB can't cross lanes, ensure this doesn't happen.
7338     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
7339       return SDValue();
7340 
7341     M = M % LaneSize;
7342     M = M * NumEltBytes + (i % NumEltBytes);
7343     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
7344   }
7345 
7346   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
7347   return DAG.getBitcast(
7348       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
7349                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
7350 }
7351 
7352 // X86 has dedicated unpack instructions that can handle specific blend
7353 // operations: UNPCKH and UNPCKL.
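// For example (for illustration only): on v4i32, UNPCKL corresponds to the
// shuffle mask [0, 4, 1, 5] and UNPCKH to [2, 6, 3, 7]; the loop below builds
// the per-128-bit-lane equivalents of these masks for wider vectors.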
7354 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
7355                                            ArrayRef<int> Mask, SDValue V1,
7356                                            SDValue V2, SelectionDAG &DAG) {
7357   int NumElts = VT.getVectorNumElements();
7358   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7359   SmallVector<int, 8> Unpckl(NumElts);
7360   SmallVector<int, 8> Unpckh(NumElts);
7361 
7362   for (int i = 0; i < NumElts; ++i) {
7363     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7364     int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
7365     int HiPos = LoPos + NumEltsInLane / 2;
7366     Unpckl[i] = LoPos;
7367     Unpckh[i] = HiPos;
7368   }
7369 
7370   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7371     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
7372   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7373     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
7374 
7375   // Commute and try again.
7376   ShuffleVectorSDNode::commuteMask(Unpckl);
7377   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7378     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
7379 
7380   ShuffleVectorSDNode::commuteMask(Unpckh);
7381   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7382     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
7383 
7384   return SDValue();
7385 }
7386 
7387 /// \brief Try to emit a bitmask instruction for a shuffle.
7388 ///
7389 /// This handles cases where we can model a blend exactly as a bitmask due to
7390 /// one of the inputs being zeroable.
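///
/// Sketch of the idea (assuming V2 is an all-zeros build_vector): the v4i32
/// mask [0, 5, 2, 7] keeps elements 0 and 2 of V1 and zeroes elements 1 and 3,
/// so it can be emitted as V1 & <-1, 0, -1, 0> using AND (or FAND for
/// floating-point types) instead of a real blend.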
7391 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
7392                                            SDValue V2, ArrayRef<int> Mask,
7393                                            SelectionDAG &DAG) {
7394   MVT EltVT = VT.getVectorElementType();
7395   int NumEltBits = EltVT.getSizeInBits();
7396   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7397   SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
7398   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7399                                     IntEltVT);
7400   if (EltVT.isFloatingPoint()) {
7401     Zero = DAG.getBitcast(EltVT, Zero);
7402     AllOnes = DAG.getBitcast(EltVT, AllOnes);
7403   }
7404   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7405   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7406   SDValue V;
7407   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7408     if (Zeroable[i])
7409       continue;
7410     if (Mask[i] % Size != i)
7411       return SDValue(); // Not a blend.
7412     if (!V)
7413       V = Mask[i] < Size ? V1 : V2;
7414     else if (V != (Mask[i] < Size ? V1 : V2))
7415       return SDValue(); // Can only let one input through the mask.
7416 
7417     VMaskOps[i] = AllOnes;
7418   }
7419   if (!V)
7420     return SDValue(); // No non-zeroable elements!
7421 
7422   SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
7423   V = DAG.getNode(VT.isFloatingPoint()
7424                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7425                   DL, VT, V, VMask);
7426   return V;
7427 }
7428 
7429 /// \brief Try to emit a blend instruction for a shuffle using bit math.
7430 ///
7431 /// This is used as a fallback approach when first class blend instructions are
7432 /// unavailable. Currently it is only suitable for integer vectors, but could
7433 /// be generalized for floating point vectors if desirable.
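///
/// Rough shape of the emitted pattern (an informal sketch): for a blend such
/// as the v4i32 mask [0, 5, 2, 7] we build M = <-1, 0, -1, 0> and compute
/// (V1 & M) | (V2 & ~M), with the second term emitted via ANDNP.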
7434 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
7435                                             SDValue V2, ArrayRef<int> Mask,
7436                                             SelectionDAG &DAG) {
7437   assert(VT.isInteger() && "Only supports integer vector types!");
7438   MVT EltVT = VT.getVectorElementType();
7439   int NumEltBits = EltVT.getSizeInBits();
7440   SDValue Zero = DAG.getConstant(0, DL, EltVT);
7441   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7442                                     EltVT);
7443   SmallVector<SDValue, 16> MaskOps;
7444   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7445     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
7446       return SDValue(); // Shuffled input!
7447     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
7448   }
7449 
7450   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
7451   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
7452   // We have to cast V2 around.
7453   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
7454   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
7455                                       DAG.getBitcast(MaskVT, V1Mask),
7456                                       DAG.getBitcast(MaskVT, V2)));
7457   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
7458 }
7459 
7460 /// \brief Try to emit a blend instruction for a shuffle.
7461 ///
7462 /// This doesn't do any checks for the availability of instructions for blending
7463 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7464 /// be matched in the backend with the type given. What it does check for is
7465 /// that the shuffle mask is a blend, or convertible into a blend with zero.
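///
/// For instance (illustrative only): the v4i32 mask [0, 5, 2, 7] takes
/// elements 1 and 3 from V2, so the immediate blend mask becomes 0b1010;
/// zeroable lanes may additionally be retargeted at whichever input is a zero
/// vector so that the mask stays a pure blend.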
7466 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
7467                                          SDValue V2, ArrayRef<int> Original,
7468                                          const X86Subtarget &Subtarget,
7469                                          SelectionDAG &DAG) {
7470   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7471   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7472   SmallVector<int, 8> Mask(Original.begin(), Original.end());
7473   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7474   bool ForceV1Zero = false, ForceV2Zero = false;
7475 
7476   // Attempt to generate the binary blend mask. If an input is zero then
7477   // we can use any lane.
7478   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
7479   unsigned BlendMask = 0;
7480   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7481     int M = Mask[i];
7482     if (M < 0)
7483       continue;
7484     if (M == i)
7485       continue;
7486     if (M == i + Size) {
7487       BlendMask |= 1u << i;
7488       continue;
7489     }
7490     if (Zeroable[i]) {
7491       if (V1IsZero) {
7492         ForceV1Zero = true;
7493         Mask[i] = i;
7494         continue;
7495       }
7496       if (V2IsZero) {
7497         ForceV2Zero = true;
7498         BlendMask |= 1u << i;
7499         Mask[i] = i + Size;
7500         continue;
7501       }
7502     }
7503     return SDValue(); // Shuffled input!
7504   }
7505 
7506   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
7507   if (ForceV1Zero)
7508     V1 = getZeroVector(VT, Subtarget, DAG, DL);
7509   if (ForceV2Zero)
7510     V2 = getZeroVector(VT, Subtarget, DAG, DL);
7511 
7512   auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
7513     unsigned ScaledMask = 0;
7514     for (int i = 0; i != Size; ++i)
7515       if (BlendMask & (1u << i))
7516         for (int j = 0; j != Scale; ++j)
7517           ScaledMask |= 1u << (i * Scale + j);
7518     return ScaledMask;
7519   };
7520 
7521   switch (VT.SimpleTy) {
7522   case MVT::v2f64:
7523   case MVT::v4f32:
7524   case MVT::v4f64:
7525   case MVT::v8f32:
7526     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7527                        DAG.getConstant(BlendMask, DL, MVT::i8));
7528 
7529   case MVT::v4i64:
7530   case MVT::v8i32:
7531     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7532     // FALLTHROUGH
7533   case MVT::v2i64:
7534   case MVT::v4i32:
7535     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7536     // that instruction.
7537     if (Subtarget.hasAVX2()) {
7538       // Scale the blend by the number of 32-bit dwords per element.
7539       int Scale =  VT.getScalarSizeInBits() / 32;
7540       BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7541       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7542       V1 = DAG.getBitcast(BlendVT, V1);
7543       V2 = DAG.getBitcast(BlendVT, V2);
7544       return DAG.getBitcast(
7545           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7546                           DAG.getConstant(BlendMask, DL, MVT::i8)));
7547     }
7548     // FALLTHROUGH
7549   case MVT::v8i16: {
7550     // For integer shuffles we need to expand the mask and cast the inputs to
7551     // v8i16s prior to blending.
7552     int Scale = 8 / VT.getVectorNumElements();
7553     BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7554     V1 = DAG.getBitcast(MVT::v8i16, V1);
7555     V2 = DAG.getBitcast(MVT::v8i16, V2);
7556     return DAG.getBitcast(VT,
7557                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7558                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
7559   }
7560 
7561   case MVT::v16i16: {
7562     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7563     SmallVector<int, 8> RepeatedMask;
7564     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7565       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7566       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7567       BlendMask = 0;
7568       for (int i = 0; i < 8; ++i)
7569         if (RepeatedMask[i] >= 8)
7570           BlendMask |= 1u << i;
7571       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7572                          DAG.getConstant(BlendMask, DL, MVT::i8));
7573     }
7574   }
7575     // FALLTHROUGH
7576   case MVT::v16i8:
7577   case MVT::v32i8: {
7578     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
7579            "256-bit byte-blends require AVX2 support!");
7580 
7581     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
7582     if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
7583       return Masked;
7584 
7585     // Scale the blend by the number of bytes per element.
7586     int Scale = VT.getScalarSizeInBits() / 8;
7587 
7588     // This form of blend is always done on bytes. Compute the byte vector
7589     // type.
7590     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
7591 
7592     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7593     // mix of LLVM's code generator and the x86 backend. We tell the code
7594     // generator that boolean values in the elements of an x86 vector register
7595     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7596     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7597     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7598     // of the element (the remaining are ignored) and 0 in that high bit would
7599     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7600     // the LLVM model for boolean values in vector elements gets the relevant
7601     // bit set, it is set backwards and over constrained relative to x86's
7602     // actual model.
7603     SmallVector<SDValue, 32> VSELECTMask;
7604     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7605       for (int j = 0; j < Scale; ++j)
7606         VSELECTMask.push_back(
7607             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7608                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
7609                                           MVT::i8));
7610 
7611     V1 = DAG.getBitcast(BlendVT, V1);
7612     V2 = DAG.getBitcast(BlendVT, V2);
7613     return DAG.getBitcast(
7614         VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
7615                         DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
7616   }
7617 
7618   default:
7619     llvm_unreachable("Not a supported integer vector type!");
7620   }
7621 }
7622 
7623 /// \brief Try to lower as a blend of elements from two inputs followed by
7624 /// a single-input permutation.
7625 ///
7626 /// This matches the pattern where we can blend elements from two inputs and
7627 /// then reduce the shuffle to a single-input permutation.
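///
/// A small worked example (for illustration): the v4i32 mask [2, 7, 0, 5]
/// becomes a blend of V1 and V2 with mask [0, 5, 2, 7] followed by a
/// single-input permute with mask [2, 3, 0, 1].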
7628 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
7629                                                    SDValue V1, SDValue V2,
7630                                                    ArrayRef<int> Mask,
7631                                                    SelectionDAG &DAG) {
7632   // We build up the blend mask while checking whether a blend is a viable way
7633   // to reduce the shuffle.
7634   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7635   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7636 
7637   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7638     if (Mask[i] < 0)
7639       continue;
7640 
7641     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7642 
7643     if (BlendMask[Mask[i] % Size] < 0)
7644       BlendMask[Mask[i] % Size] = Mask[i];
7645     else if (BlendMask[Mask[i] % Size] != Mask[i])
7646       return SDValue(); // Can't blend in the needed input!
7647 
7648     PermuteMask[i] = Mask[i] % Size;
7649   }
7650 
7651   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7652   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7653 }
7654 
7655 /// \brief Generic routine to decompose a shuffle and blend into independent
7656 /// blends and permutes.
7657 ///
7658 /// This matches the extremely common pattern for handling combined
7659 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7660 /// operations. It will try to pick the best arrangement of shuffles and
7661 /// blends.
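///
/// As a sketch: if the simpler blend-then-permute strategy fails, the v4i32
/// mask [2, 6, 1, 7] is decomposed into independent shuffles of V1 (mask
/// [2, -1, 1, -1]) and V2 (mask [-1, 2, -1, 3]) followed by a blend with mask
/// [0, 5, 2, 7].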
7662 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
7663                                                           MVT VT, SDValue V1,
7664                                                           SDValue V2,
7665                                                           ArrayRef<int> Mask,
7666                                                           SelectionDAG &DAG) {
7667   // Shuffle the input elements into the desired positions in V1 and V2 and
7668   // blend them together.
7669   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7670   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7671   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7672   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7673     if (Mask[i] >= 0 && Mask[i] < Size) {
7674       V1Mask[i] = Mask[i];
7675       BlendMask[i] = i;
7676     } else if (Mask[i] >= Size) {
7677       V2Mask[i] = Mask[i] - Size;
7678       BlendMask[i] = i + Size;
7679     }
7680 
7681   // Try to lower with the simpler initial blend strategy unless one of the
7682   // input shuffles would be a no-op. We prefer to shuffle inputs as the
7683   // shuffle may be able to fold with a load or other benefit. However, when
7684   // we'll have to do 2x as many shuffles in order to achieve this, blending
7685   // first is a better strategy.
7686   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7687     if (SDValue BlendPerm =
7688             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7689       return BlendPerm;
7690 
7691   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7692   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7693   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7694 }
7695 
7696 /// \brief Try to lower a vector shuffle as a byte rotation.
7697 ///
7698 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7699 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7700 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7701 /// try to generically lower a vector shuffle through such a pattern. It
7702 /// does not check for the profitability of lowering either as PALIGNR or
7703 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7704 /// This matches shuffle vectors that look like:
7705 ///
7706 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7707 ///
7708 /// Essentially it concatenates V1 and V2, shifts right by some number of
7709 /// elements, and takes the low elements as the result. Note that while this is
7710 /// specified as a *right shift* because x86 is little-endian, it is a *left
7711 /// rotate* of the vector lanes.
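///
/// For the v8i16 example above the rotation is 3 elements; with SSSE3 this is
/// a PALIGNR with an immediate of 3 * 2 == 6 bytes, and without SSSE3 it is
/// emitted as (V1 shifted left by 10 bytes) | (V2 shifted right by 6 bytes)
/// using PSLLDQ/PSRLDQ/POR.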
7712 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
7713                                               SDValue V1, SDValue V2,
7714                                               ArrayRef<int> Mask,
7715                                               const X86Subtarget &Subtarget,
7716                                               SelectionDAG &DAG) {
7717   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7718 
7719   int NumElts = Mask.size();
7720   int NumLanes = VT.getSizeInBits() / 128;
7721   int NumLaneElts = NumElts / NumLanes;
7722 
7723   // We need to detect various ways of spelling a rotation:
7724   //   [11, 12, 13, 14, 15,  0,  1,  2]
7725   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7726   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7727   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7728   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7729   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7730   int Rotation = 0;
7731   SDValue Lo, Hi;
7732   for (int l = 0; l < NumElts; l += NumLaneElts) {
7733     for (int i = 0; i < NumLaneElts; ++i) {
7734       if (Mask[l + i] < 0)
7735         continue;
7736 
7737       // Get the mod-Size index and lane correct it.
7738       int LaneIdx = (Mask[l + i] % NumElts) - l;
7739       // Make sure it was in this lane.
7740       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7741         return SDValue();
7742 
7743       // Determine where a rotated vector would have started.
7744       int StartIdx = i - LaneIdx;
7745       if (StartIdx == 0)
7746         // The identity rotation isn't interesting, stop.
7747         return SDValue();
7748 
7749       // If we found the tail of a vector, the rotation must be the number of
7750       // missing front elements. If we found the head of a vector, the rotation
7751       // is the number of head elements that appear in the lane.
7752       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
7753 
7754       if (Rotation == 0)
7755         Rotation = CandidateRotation;
7756       else if (Rotation != CandidateRotation)
7757         // The rotations don't match, so we can't match this mask.
7758         return SDValue();
7759 
7760       // Compute which value this mask is pointing at.
7761       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
7762 
7763       // Compute which of the two target values this index should be assigned
7764       // to. This reflects whether the high elements are remaining or the low
7765       // elements are remaining.
7766       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7767 
7768       // Either set up this value if we've not encountered it before, or check
7769       // that it remains consistent.
7770       if (!TargetV)
7771         TargetV = MaskV;
7772       else if (TargetV != MaskV)
7773         // This may be a rotation, but it pulls from the inputs in some
7774         // unsupported interleaving.
7775         return SDValue();
7776     }
7777   }
7778 
7779   // Check that we successfully analyzed the mask, and normalize the results.
7780   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7781   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7782   if (!Lo)
7783     Lo = Hi;
7784   else if (!Hi)
7785     Hi = Lo;
7786 
7787   // Cast the inputs to i8 vector of correct length to match PALIGNR or
7788   // PSLLDQ/PSRLDQ.
7789   MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
7790   Lo = DAG.getBitcast(ByteVT, Lo);
7791   Hi = DAG.getBitcast(ByteVT, Hi);
7792 
7793   // The actual rotate instruction rotates bytes, so we need to scale the
7794   // rotation based on how many bytes are in the vector lane.
7795   int Scale = 16 / NumLaneElts;
7796 
7797   // SSSE3 targets can use the palignr instruction.
7798   if (Subtarget.hasSSSE3()) {
7799     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
7800            "512-bit PALIGNR requires BWI instructions");
7801     return DAG.getBitcast(
7802         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
7803                         DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
7804   }
7805 
7806   assert(VT.is128BitVector() &&
7807          "Rotate-based lowering only supports 128-bit lowering!");
7808   assert(Mask.size() <= 16 &&
7809          "Can shuffle at most 16 bytes in a 128-bit vector!");
7810   assert(ByteVT == MVT::v16i8 &&
7811          "SSE2 rotate lowering only needed for v16i8!");
7812 
7813   // Default SSE2 implementation
7814   int LoByteShift = 16 - Rotation * Scale;
7815   int HiByteShift = Rotation * Scale;
7816 
7817   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
7818                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
7819   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
7820                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
7821   return DAG.getBitcast(VT,
7822                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
7823 }
7824 
7825 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7826 ///
7827 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
7828 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
7829 /// matches elements from one of the input vectors shuffled to the left or
7830 /// right with zeroable elements 'shifted in'. It handles both the strictly
7831 /// bit-wise element shifts and the byte shift across an entire 128-bit double
7832 /// quad word lane.
7833 ///
7834 /// PSLL : (little-endian) left bit shift.
7835 /// [ zz, 0, zz,  2 ]
7836 /// [ -1, 4, zz, -1 ]
7837 /// PSRL : (little-endian) right bit shift.
7838 /// [  1, zz,  3, zz]
7839 /// [ -1, -1,  7, zz]
7840 /// PSLLDQ : (little-endian) left byte shift
7841 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
7842 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
7843 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
7844 /// PSRLDQ : (little-endian) right byte shift
7845 /// [  5, 6,  7, zz, zz, zz, zz, zz]
7846 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
7847 /// [  1, 2, -1, -1, -1, -1, zz, zz]
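///
/// As a concrete illustration of the matching below: the first PSLL example
/// above, read as a v4i32 mask [ zz, 0, zz, 2 ], is matched by doubling the
/// element width to i64 and emitting a v2i64 VSHLI (PSLLQ) by 32 bits.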
7848 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
7849                                          SDValue V2, ArrayRef<int> Mask,
7850                                          const X86Subtarget &Subtarget,
7851                                          SelectionDAG &DAG) {
7852   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7853 
7854   int Size = Mask.size();
7855   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7856 
7857   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
7858     for (int i = 0; i < Size; i += Scale)
7859       for (int j = 0; j < Shift; ++j)
7860         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
7861           return false;
7862 
7863     return true;
7864   };
7865 
7866   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
7867     for (int i = 0; i != Size; i += Scale) {
7868       unsigned Pos = Left ? i + Shift : i;
7869       unsigned Low = Left ? i : i + Shift;
7870       unsigned Len = Scale - Shift;
7871       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
7872                                       Low + (V == V1 ? 0 : Size)))
7873         return SDValue();
7874     }
7875 
7876     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
7877     bool ByteShift = ShiftEltBits > 64;
7878     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
7879                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
7880     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
7881 
7882     // Normalize the scale for byte shifts to still produce an i64 element
7883     // type.
7884     Scale = ByteShift ? Scale / 2 : Scale;
7885 
7886     // We need to round trip through the appropriate type for the shift.
7887     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7888     MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
7889                             : MVT::getVectorVT(ShiftSVT, Size / Scale);
7890     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7891            "Illegal integer vector type");
7892     V = DAG.getBitcast(ShiftVT, V);
7893 
7894     V = DAG.getNode(OpCode, DL, ShiftVT, V,
7895                     DAG.getConstant(ShiftAmt, DL, MVT::i8));
7896     return DAG.getBitcast(VT, V);
7897   };
7898 
7899   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7900   // keep doubling the size of the integer elements up to that. We can
7901   // then shift the elements of the integer vector by whole multiples of
7902   // their width within the elements of the larger integer vector. Test each
7903   // multiple to see if we can find a match with the moved element indices
7904   // and that the shifted in elements are all zeroable.
7905   unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
7906   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
7907     for (int Shift = 1; Shift != Scale; ++Shift)
7908       for (bool Left : {true, false})
7909         if (CheckZeros(Shift, Scale, Left))
7910           for (SDValue V : {V1, V2})
7911             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
7912               return Match;
7913 
7914   // no match
7915   return SDValue();
7916 }
7917 
7918 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
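///
/// A representative case (one of several forms handled here): the v8i16 mask
/// [ 1, 2, zz, zz, -1, -1, -1, -1 ] extracts two sequential elements starting
/// at index 1, so it can be emitted as EXTRQI(V1, /*BitLen*/32, /*BitIdx*/16)
/// with the rest of the low half zeroed and the upper half undefined.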
7919 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
7920                                            SDValue V2, ArrayRef<int> Mask,
7921                                            SelectionDAG &DAG) {
7922   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7923   assert(!Zeroable.all() && "Fully zeroable shuffle mask");
7924 
7925   int Size = Mask.size();
7926   int HalfSize = Size / 2;
7927   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7928 
7929   // Upper half must be undefined.
7930   if (!isUndefInRange(Mask, HalfSize, HalfSize))
7931     return SDValue();
7932 
7933   // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
7934   // Remainder of lower half result is zero and upper half is all undef.
7935   auto LowerAsEXTRQ = [&]() {
7936     // Determine the extraction length from the part of the
7937     // lower half that isn't zeroable.
7938     int Len = HalfSize;
7939     for (; Len > 0; --Len)
7940       if (!Zeroable[Len - 1])
7941         break;
7942     assert(Len > 0 && "Zeroable shuffle mask");
7943 
7944     // Attempt to match first Len sequential elements from the lower half.
7945     SDValue Src;
7946     int Idx = -1;
7947     for (int i = 0; i != Len; ++i) {
7948       int M = Mask[i];
7949       if (M < 0)
7950         continue;
7951       SDValue &V = (M < Size ? V1 : V2);
7952       M = M % Size;
7953 
7954       // The extracted elements must start at a valid index and all mask
7955       // elements must be in the lower half.
7956       if (i > M || M >= HalfSize)
7957         return SDValue();
7958 
7959       if (Idx < 0 || (Src == V && Idx == (M - i))) {
7960         Src = V;
7961         Idx = M - i;
7962         continue;
7963       }
7964       return SDValue();
7965     }
7966 
7967     if (Idx < 0)
7968       return SDValue();
7969 
7970     assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
7971     int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
7972     int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
7973     return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
7974                        DAG.getConstant(BitLen, DL, MVT::i8),
7975                        DAG.getConstant(BitIdx, DL, MVT::i8));
7976   };
7977 
7978   if (SDValue ExtrQ = LowerAsEXTRQ())
7979     return ExtrQ;
7980 
7981   // INSERTQ: Extract lowest Len elements from lower half of second source and
7982   // insert over first source, starting at Idx.
7983   // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
7984   auto LowerAsInsertQ = [&]() {
7985     for (int Idx = 0; Idx != HalfSize; ++Idx) {
7986       SDValue Base;
7987 
7988       // Attempt to match first source from mask before insertion point.
7989       if (isUndefInRange(Mask, 0, Idx)) {
7990         /* EMPTY */
7991       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
7992         Base = V1;
7993       } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
7994         Base = V2;
7995       } else {
7996         continue;
7997       }
7998 
7999       // Extend the extraction length looking to match both the insertion of
8000       // the second source and the remaining elements of the first.
8001       for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8002         SDValue Insert;
8003         int Len = Hi - Idx;
8004 
8005         // Match insertion.
8006         if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8007           Insert = V1;
8008         } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8009           Insert = V2;
8010         } else {
8011           continue;
8012         }
8013 
8014         // Match the remaining elements of the lower half.
8015         if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8016           /* EMPTY */
8017         } else if ((!Base || (Base == V1)) &&
8018                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8019           Base = V1;
8020         } else if ((!Base || (Base == V2)) &&
8021                    isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8022                                               Size + Hi)) {
8023           Base = V2;
8024         } else {
8025           continue;
8026         }
8027 
8028         // We may not have a base (first source) - this can safely be undefined.
8029         if (!Base)
8030           Base = DAG.getUNDEF(VT);
8031 
8032         int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8033         int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8034         return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8035                            DAG.getConstant(BitLen, DL, MVT::i8),
8036                            DAG.getConstant(BitIdx, DL, MVT::i8));
8037       }
8038     }
8039 
8040     return SDValue();
8041   };
8042 
8043   if (SDValue InsertQ = LowerAsInsertQ())
8044     return InsertQ;
8045 
8046   return SDValue();
8047 }
8048 
8049 /// \brief Lower a vector shuffle as a zero or any extension.
8050 ///
8051 /// Given a specific number of elements, element bit width, and extension
8052 /// stride, produce either a zero or any extension based on the available
8053 /// features of the subtarget. The extended elements are consecutive and
8054 /// can start from a non-zero element offset in the input; to
8055 /// avoid excess shuffling, the offset must either be in the bottom lane
8056 /// or at the start of a higher lane. All extended elements must be from
8057 /// the same lane.
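///
/// For example (assuming SSE4.1): the v16i8 mask
/// [ 0, zz, zz, zz, 1, zz, zz, zz, 2, zz, zz, zz, 3, zz, zz, zz ] corresponds
/// to Scale == 4 and Offset == 0 and is lowered to a PMOVZXBD-style
/// X86ISD::VZEXT of the low four bytes to v4i32, bitcast back to v16i8.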
8058 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8059     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8060     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8061   assert(Scale > 1 && "Need a scale to extend.");
8062   int EltBits = VT.getScalarSizeInBits();
8063   int NumElements = VT.getVectorNumElements();
8064   int NumEltsPerLane = 128 / EltBits;
8065   int OffsetLane = Offset / NumEltsPerLane;
8066   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8067          "Only 8, 16, and 32 bit elements can be extended.");
8068   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8069   assert(0 <= Offset && "Extension offset must be non-negative.");
8070   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8071          "Extension offset must be in the first lane or start an upper lane.");
8072 
8073   // Check that an index is in same lane as the base offset.
8074   auto SafeOffset = [&](int Idx) {
8075     return OffsetLane == (Idx / NumEltsPerLane);
8076   };
8077 
8078   // Shift along an input so that the offset base moves to the first element.
8079   auto ShuffleOffset = [&](SDValue V) {
8080     if (!Offset)
8081       return V;
8082 
8083     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8084     for (int i = 0; i * Scale < NumElements; ++i) {
8085       int SrcIdx = i + Offset;
8086       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8087     }
8088     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8089   };
8090 
8091   // Found a valid zext mask! Try various lowering strategies based on the
8092   // input type and available ISA extensions.
8093   if (Subtarget.hasSSE41()) {
8094     // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
8095     // PUNPCK will catch this in a later shuffle match.
8096     if (Offset && Scale == 2 && VT.is128BitVector())
8097       return SDValue();
8098     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8099                                  NumElements / Scale);
8100     InputV = ShuffleOffset(InputV);
8101 
8102     // For 256-bit vectors, we only need the lower (128-bit) input half.
8103     if (VT.is256BitVector())
8104       InputV = extract128BitVector(InputV, 0, DAG, DL);
8105 
8106     InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8107     return DAG.getBitcast(VT, InputV);
8108   }
8109 
8110   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8111 
8112   // For any extends we can cheat for larger element sizes and use shuffle
8113   // instructions that can fold with a load and/or copy.
8114   if (AnyExt && EltBits == 32) {
8115     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8116                          -1};
8117     return DAG.getBitcast(
8118         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8119                         DAG.getBitcast(MVT::v4i32, InputV),
8120                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8121   }
8122   if (AnyExt && EltBits == 16 && Scale > 2) {
8123     int PSHUFDMask[4] = {Offset / 2, -1,
8124                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8125     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8126                          DAG.getBitcast(MVT::v4i32, InputV),
8127                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8128     int PSHUFWMask[4] = {1, -1, -1, -1};
8129     unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8130     return DAG.getBitcast(
8131         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
8132                         DAG.getBitcast(MVT::v8i16, InputV),
8133                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
8134   }
8135 
8136   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
8137   // to 64-bits.
8138   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
8139     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
8140     assert(VT.is128BitVector() && "Unexpected vector width!");
8141 
8142     int LoIdx = Offset * EltBits;
8143     SDValue Lo = DAG.getBitcast(
8144         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8145                                 DAG.getConstant(EltBits, DL, MVT::i8),
8146                                 DAG.getConstant(LoIdx, DL, MVT::i8)));
8147 
8148     if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
8149         !SafeOffset(Offset + 1))
8150       return DAG.getBitcast(VT, Lo);
8151 
8152     int HiIdx = (Offset + 1) * EltBits;
8153     SDValue Hi = DAG.getBitcast(
8154         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8155                                 DAG.getConstant(EltBits, DL, MVT::i8),
8156                                 DAG.getConstant(HiIdx, DL, MVT::i8)));
8157     return DAG.getBitcast(VT,
8158                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
8159   }
8160 
8161   // If this would require more than 2 unpack instructions to expand, use
8162   // pshufb when available. We can only use more than 2 unpack instructions
8163   // when zero extending i8 elements which also makes it easier to use pshufb.
8164   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
8165     assert(NumElements == 16 && "Unexpected byte vector width!");
8166     SDValue PSHUFBMask[16];
8167     for (int i = 0; i < 16; ++i) {
8168       int Idx = Offset + (i / Scale);
8169       PSHUFBMask[i] = DAG.getConstant(
8170           (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
8171     }
8172     InputV = DAG.getBitcast(MVT::v16i8, InputV);
8173     return DAG.getBitcast(
8174         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8175                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
8176   }
8177 
8178   // If we are extending from an offset, ensure we start on a boundary that
8179   // we can unpack from.
8180   int AlignToUnpack = Offset % (NumElements / Scale);
8181   if (AlignToUnpack) {
8182     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8183     for (int i = AlignToUnpack; i < NumElements; ++i)
8184       ShMask[i - AlignToUnpack] = i;
8185     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
8186     Offset -= AlignToUnpack;
8187   }
8188 
8189   // Otherwise emit a sequence of unpacks.
8190   do {
8191     unsigned UnpackLoHi = X86ISD::UNPCKL;
8192     if (Offset >= (NumElements / 2)) {
8193       UnpackLoHi = X86ISD::UNPCKH;
8194       Offset -= (NumElements / 2);
8195     }
8196 
8197     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8198     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8199                          : getZeroVector(InputVT, Subtarget, DAG, DL);
8200     InputV = DAG.getBitcast(InputVT, InputV);
8201     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
8202     Scale /= 2;
8203     EltBits *= 2;
8204     NumElements /= 2;
8205   } while (Scale > 1);
8206   return DAG.getBitcast(VT, InputV);
8207 }
8208 
8209 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8210 ///
8211 /// This routine will try to do everything in its power to cleverly lower
8212 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
8213 /// check for the profitability of this lowering; it tries to aggressively
8214 /// match this pattern. It will use all of the micro-architectural details it
8215 /// can to emit an efficient lowering. It handles both blends with all-zero
8216 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8217 /// masking out later).
8218 ///
8219 /// The reason we have dedicated lowering for zext-style shuffles is that they
8220 /// are both incredibly common and often quite performance sensitive.
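///
/// One simple case handled at the end of this routine (a sketch): the v4i32
/// mask [ 0, 1, zz, zz ] is not a strided extension, but its upper half is
/// zeroable and its lower half is sequential from V1, so it becomes a
/// MOVQ-style X86ISD::VZEXT_MOVL that copies the low 64 bits and zeroes the
/// rest.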
8221 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8222     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8223     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8224   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8225 
8226   int Bits = VT.getSizeInBits();
8227   int NumLanes = Bits / 128;
8228   int NumElements = VT.getVectorNumElements();
8229   int NumEltsPerLane = NumElements / NumLanes;
8230   assert(VT.getScalarSizeInBits() <= 32 &&
8231          "Exceeds 32-bit integer zero extension limit");
8232   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8233 
8234   // Define a helper function to check a particular ext-scale and lower to it if
8235   // valid.
8236   auto Lower = [&](int Scale) -> SDValue {
8237     SDValue InputV;
8238     bool AnyExt = true;
8239     int Offset = 0;
8240     int Matches = 0;
8241     for (int i = 0; i < NumElements; ++i) {
8242       int M = Mask[i];
8243       if (M < 0)
8244         continue; // Valid anywhere but doesn't tell us anything.
8245       if (i % Scale != 0) {
8246         // Each of the extended elements needs to be zeroable.
8247         if (!Zeroable[i])
8248           return SDValue();
8249 
8250         // We no longer are in the anyext case.
8251         AnyExt = false;
8252         continue;
8253       }
8254 
8255       // The base elements need to be consecutive indices into the
8256       // same input vector.
8257       SDValue V = M < NumElements ? V1 : V2;
8258       M = M % NumElements;
8259       if (!InputV) {
8260         InputV = V;
8261         Offset = M - (i / Scale);
8262       } else if (InputV != V)
8263         return SDValue(); // Flip-flopping inputs.
8264 
8265       // Offset must start in the lowest 128-bit lane or at the start of an
8266       // upper lane.
8267       // FIXME: Is it ever worth allowing a negative base offset?
8268       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
8269             (Offset % NumEltsPerLane) == 0))
8270         return SDValue();
8271 
8272       // If we are offsetting, all referenced entries must come from the same
8273       // lane.
8274       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
8275         return SDValue();
8276 
8277       if ((M % NumElements) != (Offset + (i / Scale)))
8278         return SDValue(); // Non-consecutive strided elements.
8279       Matches++;
8280     }
8281 
8282     // If we fail to find an input, we have a zero-shuffle which should always
8283     // have already been handled.
8284     // FIXME: Maybe handle this here in case during blending we end up with one?
8285     if (!InputV)
8286       return SDValue();
8287 
8288     // If we are offsetting, don't extend if we only match a single input; we
8289     // can always do better by using a basic PSHUF or PUNPCK.
8290     if (Offset != 0 && Matches < 2)
8291       return SDValue();
8292 
8293     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8294         DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
8295   };
8296 
8297   // The widest scale possible for extending is to a 64-bit integer.
8298   assert(Bits % 64 == 0 &&
8299          "The number of bits in a vector must be divisible by 64 on x86!");
8300   int NumExtElements = Bits / 64;
8301 
8302   // Each iteration, try extending the elements half as much, but into twice as
8303   // many elements.
8304   for (; NumExtElements < NumElements; NumExtElements *= 2) {
8305     assert(NumElements % NumExtElements == 0 &&
8306            "The input vector size must be divisible by the extended size.");
8307     if (SDValue V = Lower(NumElements / NumExtElements))
8308       return V;
8309   }
8310 
8311   // General extends failed, but 128-bit vectors may be able to use MOVQ.
8312   if (Bits != 128)
8313     return SDValue();
8314 
8315   // Returns one of the source operands if the shuffle can be reduced to a
8316   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8317   auto CanZExtLowHalf = [&]() {
8318     for (int i = NumElements / 2; i != NumElements; ++i)
8319       if (!Zeroable[i])
8320         return SDValue();
8321     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8322       return V1;
8323     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8324       return V2;
8325     return SDValue();
8326   };
8327 
8328   if (SDValue V = CanZExtLowHalf()) {
8329     V = DAG.getBitcast(MVT::v2i64, V);
8330     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8331     return DAG.getBitcast(VT, V);
8332   }
8333 
8334   // No viable ext lowering found.
8335   return SDValue();
8336 }
8337 
8338 /// \brief Try to get a scalar value for a specific element of a vector.
8339 ///
8340 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8341 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8342                                               SelectionDAG &DAG) {
8343   MVT VT = V.getSimpleValueType();
8344   MVT EltVT = VT.getVectorElementType();
8345   V = peekThroughBitcasts(V);
8346 
8347   // If the bitcasts shift the element size, we can't extract an equivalent
8348   // element from it.
8349   MVT NewVT = V.getSimpleValueType();
8350   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8351     return SDValue();
8352 
8353   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8354       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
8355     // Ensure the scalar operand is the same size as the destination.
8356     // FIXME: Add support for scalar truncation where possible.
8357     SDValue S = V.getOperand(Idx);
8358     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
8359       return DAG.getBitcast(EltVT, S);
8360   }
8361 
8362   return SDValue();
8363 }
8364 
8365 /// \brief Helper to test for a load that can be folded with x86 shuffles.
8366 ///
8367 /// This is particularly important because the set of instructions varies
8368 /// significantly based on whether the operand is a load or not.
8369 static bool isShuffleFoldableLoad(SDValue V) {
8370   V = peekThroughBitcasts(V);
8371   return ISD::isNON_EXTLoad(V.getNode());
8372 }
8373 
8374 /// \brief Try to lower insertion of a single element into a zero vector.
8375 ///
8376 /// This is a common pattern that we have especially efficient patterns to lower
8377 /// across all subtarget feature sets.
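///
/// For instance (an illustration, assuming V1 is an all-zeros build_vector):
/// the v4i32 mask [ 4, 1, 2, 3 ] keeps only the low element of V2 and zeroes
/// everything else, so it lowers to a VZEXT_MOVL of V2, i.e. a zero-extending
/// move of V2's low element.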
8378 static SDValue lowerVectorShuffleAsElementInsertion(
8379     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8380     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8381   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8382   MVT ExtVT = VT;
8383   MVT EltVT = VT.getVectorElementType();
8384 
8385   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8386                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8387                 Mask.begin();
8388   bool IsV1Zeroable = true;
8389   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8390     if (i != V2Index && !Zeroable[i]) {
8391       IsV1Zeroable = false;
8392       break;
8393     }
8394 
8395   // Check for a single input from a SCALAR_TO_VECTOR node.
8396   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8397   // all the smarts here sunk into that routine. However, the current
8398   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8399   // vector shuffle lowering is dead.
8400   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
8401                                                DAG);
8402   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
8403     // We need to zext the scalar if it is smaller than an i32.
8404     V2S = DAG.getBitcast(EltVT, V2S);
8405     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8406       // Using zext to expand a narrow element won't work for non-zero
8407       // insertions.
8408       if (!IsV1Zeroable)
8409         return SDValue();
8410 
8411       // Zero-extend directly to i32.
8412       ExtVT = MVT::v4i32;
8413       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8414     }
8415     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8416   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8417              EltVT == MVT::i16) {
8418     // Either not inserting from the low element of the input or the input
8419     // element size is too small to use VZEXT_MOVL to clear the high bits.
8420     return SDValue();
8421   }
8422 
8423   if (!IsV1Zeroable) {
8424     // If V1 can't be treated as a zero vector we have fewer options to lower
8425     // this. We can't support integer vectors or non-zero targets cheaply, and
8426     // the V1 elements can't be permuted in any way.
8427     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8428     if (!VT.isFloatingPoint() || V2Index != 0)
8429       return SDValue();
8430     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8431     V1Mask[V2Index] = -1;
8432     if (!isNoopShuffleMask(V1Mask))
8433       return SDValue();
8434     // This is essentially a special case blend operation, but if we have
8435     // general purpose blend operations, they are always faster. Bail and let
8436     // the rest of the lowering handle these as blends.
8437     if (Subtarget.hasSSE41())
8438       return SDValue();
8439 
8440     // Otherwise, use MOVSD or MOVSS.
8441     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8442            "Only two types of floating point element types to handle!");
8443     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8444                        ExtVT, V1, V2);
8445   }
8446 
8447   // This lowering only works for the low element with floating point vectors.
8448   if (VT.isFloatingPoint() && V2Index != 0)
8449     return SDValue();
8450 
8451   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8452   if (ExtVT != VT)
8453     V2 = DAG.getBitcast(VT, V2);
8454 
8455   if (V2Index != 0) {
8456     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8457     // the desired position. Otherwise it is more efficient to do a vector
8458     // shift left. We know that we can do a vector shift left because all
8459     // the inputs are zero.
8460     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8461       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8462       V2Shuffle[V2Index] = 0;
8463       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8464     } else {
8465       V2 = DAG.getBitcast(MVT::v16i8, V2);
8466       V2 = DAG.getNode(
8467           X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
8468           DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
8469                           DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
8470                               DAG.getDataLayout(), VT)));
8471       V2 = DAG.getBitcast(VT, V2);
8472     }
8473   }
8474   return V2;
8475 }
8476 
8477 /// Try to lower broadcast of a single - truncated - integer element,
8478 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
8479 ///
8480 /// This assumes we have AVX2.
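///
/// Sketch of one case: broadcasting v16i8 element 1 of a v4i32
/// scalar_to_vector means broadcasting byte 1 of the i32 scalar, so the scalar
/// is shifted right by 8 bits, truncated to i8 and fed to VBROADCAST, with the
/// hope that the srl/trunc/load fold into the broadcast itself.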
8481 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
8482                                                   SDValue V0, int BroadcastIdx,
8483                                                   const X86Subtarget &Subtarget,
8484                                                   SelectionDAG &DAG) {
8485   assert(Subtarget.hasAVX2() &&
8486          "We can only lower integer broadcasts with AVX2!");
8487 
8488   EVT EltVT = VT.getVectorElementType();
8489   EVT V0VT = V0.getValueType();
8490 
8491   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
8492   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
8493 
8494   EVT V0EltVT = V0VT.getVectorElementType();
8495   if (!V0EltVT.isInteger())
8496     return SDValue();
8497 
8498   const unsigned EltSize = EltVT.getSizeInBits();
8499   const unsigned V0EltSize = V0EltVT.getSizeInBits();
8500 
8501   // This is only a truncation if the original element type is larger.
8502   if (V0EltSize <= EltSize)
8503     return SDValue();
8504 
8505   assert(((V0EltSize % EltSize) == 0) &&
8506          "Scalar type sizes must all be powers of 2 on x86!");
8507 
8508   const unsigned V0Opc = V0.getOpcode();
8509   const unsigned Scale = V0EltSize / EltSize;
8510   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
8511 
8512   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
8513       V0Opc != ISD::BUILD_VECTOR)
8514     return SDValue();
8515 
8516   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
8517 
8518   // If we're extracting non-least-significant bits, shift so we can truncate.
8519   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
8520   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
8521   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
8522   if (const int OffsetIdx = BroadcastIdx % Scale)
8523     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
8524             DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
8525 
8526   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
8527                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
8528 }
8529 
8530 /// \brief Try to lower broadcast of a single element.
8531 ///
8532 /// For convenience, this code also bundles all of the subtarget feature set
8533 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8534 /// a convenient way to factor it out.
8535 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
8536 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
8537                                              SDValue V1, SDValue V2,
8538                                              ArrayRef<int> Mask,
8539                                              const X86Subtarget &Subtarget,
8540                                              SelectionDAG &DAG) {
8541   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
8542         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
8543         (Subtarget.hasAVX2() && VT.isInteger())))
8544     return SDValue();
8545 
8546   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
8547   // we can only broadcast from a register with AVX2.
8548   unsigned NumElts = Mask.size();
8549   unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
8550   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
8551 
8552   // Check that the mask is a broadcast.
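  // (A broadcast mask repeats one input index in every lane, e.g. the v4i32
  // mask <3, 3, 3, 3> broadcasts element 3 of V1.)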
8553   int BroadcastIdx = -1;
8554   for (int i = 0; i != (int)NumElts; ++i) {
8555     SmallVector<int, 8> BroadcastMask(NumElts, i);
8556     if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
8557       BroadcastIdx = i;
8558       break;
8559     }
8560   }
8561 
8562   if (BroadcastIdx < 0)
8563     return SDValue();
8564   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8565                                             "a sorted mask where the broadcast "
8566                                             "comes from V1.");
8567 
8568   // Go up the chain of (vector) values to find a scalar load that we can
8569   // combine with the broadcast.
8570   SDValue V = V1;
8571   for (;;) {
8572     switch (V.getOpcode()) {
8573     case ISD::BITCAST: {
8574       SDValue VSrc = V.getOperand(0);
8575       MVT SrcVT = VSrc.getSimpleValueType();
8576       if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
8577         break;
8578       V = VSrc;
8579       continue;
8580     }
8581     case ISD::CONCAT_VECTORS: {
8582       int OperandSize = Mask.size() / V.getNumOperands();
8583       V = V.getOperand(BroadcastIdx / OperandSize);
8584       BroadcastIdx %= OperandSize;
8585       continue;
8586     }
8587     case ISD::INSERT_SUBVECTOR: {
8588       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8589       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8590       if (!ConstantIdx)
8591         break;
8592 
8593       int BeginIdx = (int)ConstantIdx->getZExtValue();
8594       int EndIdx =
8595           BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
8596       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8597         BroadcastIdx -= BeginIdx;
8598         V = VInner;
8599       } else {
8600         V = VOuter;
8601       }
8602       continue;
8603     }
8604     }
8605     break;
8606   }
8607 
8608   // Check if this is a broadcast of a scalar. We special case lowering
8609   // for scalars so that we can more effectively fold with loads.
8610   // First, look through bitcast: if the original value has a larger element
8611   // type than the shuffle, the broadcast element is in essence truncated.
8612   // Make that explicit to ease folding.
8613   if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
8614     if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
8615             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
8616       return TruncBroadcast;
8617 
8618   MVT BroadcastVT = VT;
8619 
8620   // Peek through any bitcast (only useful for loads).
8621   SDValue BC = peekThroughBitcasts(V);
8622 
8623   // Also check the simpler case, where we can directly reuse the scalar.
8624   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8625       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8626     V = V.getOperand(BroadcastIdx);
8627 
8628     // If we can't broadcast from a register, check that the input is a load.
8629     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
8630       return SDValue();
8631   } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
8632     // 32-bit targets need to load i64 as an f64 and then bitcast the result.
8633     if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
8634       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
8635       Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
8636     }
8637 
8638     // If we are broadcasting a load that is only used by the shuffle
8639     // then we can reduce the vector load to the broadcasted scalar load.
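    // For example, broadcasting element 3 of a one-use v4f32 load becomes a
    // broadcast of a scalar f32 load from the original address plus 12 bytes.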
8640     LoadSDNode *Ld = cast<LoadSDNode>(BC);
8641     SDValue BaseAddr = Ld->getOperand(1);
8642     EVT SVT = BroadcastVT.getScalarType();
8643     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
8644     SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
8645     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
8646                     DAG.getMachineFunction().getMachineMemOperand(
8647                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
8648   } else if (!BroadcastFromReg) {
8649     // We can't broadcast from a vector register.
8650     return SDValue();
8651   } else if (BroadcastIdx != 0) {
8652     // We can only broadcast from the zero-element of a vector register,
8653     // but it can be advantageous to broadcast from the zero-element of a
8654     // subvector.
8655     if (!VT.is256BitVector() && !VT.is512BitVector())
8656       return SDValue();
8657 
8658     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
8659     if (VT == MVT::v4f64 || VT == MVT::v4i64)
8660       return SDValue();
8661 
8662     // Only broadcast the zero-element of a 128-bit subvector.
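    // (e.g. for v8i32 the element size is 32 bits, so the broadcast index must
    // be a multiple of 128 / 32 = 4, i.e. 0 or 4.)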
8663     unsigned EltSize = VT.getScalarSizeInBits();
8664     if (((BroadcastIdx * EltSize) % 128) != 0)
8665       return SDValue();
8666 
8667     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
8668     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
8669                     DAG.getIntPtrConstant(BroadcastIdx, DL));
8670   }
8671 
8672   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
8673     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
8674                     DAG.getBitcast(MVT::f64, V));
8675 
8676   // Bitcast back to the same scalar type as BroadcastVT.
8677   MVT SrcVT = V.getSimpleValueType();
8678   if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
8679     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
8680            "Unexpected vector element size");
8681     if (SrcVT.isVector()) {
8682       unsigned NumSrcElts = SrcVT.getVectorNumElements();
8683       SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
8684     } else {
8685       SrcVT = BroadcastVT.getScalarType();
8686     }
8687     V = DAG.getBitcast(SrcVT, V);
8688   }
8689 
8690   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
8691 }
8692 
8693 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8694 // INSERTPS when the V1 elements are already in the correct locations
8695 // because otherwise we can just always use two SHUFPS instructions which
8696 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8697 // perform INSERTPS if a single V1 element is out of place and all V2
8698 // elements are zeroable.
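// For example, the v4f32 mask <0, 1, 6, 3> keeps V1 elements 0, 1 and 3 in
// place and inserts V2 element 2 into lane 2, so it matches here.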
8699 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
8700                                          unsigned &InsertPSMask,
8701                                          const SmallBitVector &Zeroable,
8702                                          ArrayRef<int> Mask,
8703                                          SelectionDAG &DAG) {
8704   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
8705   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
8706   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8707   unsigned ZMask = 0;
8708   int V1DstIndex = -1;
8709   int V2DstIndex = -1;
8710   bool V1UsedInPlace = false;
8711 
8712   for (int i = 0; i < 4; ++i) {
8713     // Synthesize a zero mask from the zeroable elements (includes undefs).
8714     if (Zeroable[i]) {
8715       ZMask |= 1 << i;
8716       continue;
8717     }
8718 
8719     // Flag if we use any V1 inputs in place.
8720     if (i == Mask[i]) {
8721       V1UsedInPlace = true;
8722       continue;
8723     }
8724 
8725     // We can only insert a single non-zeroable element.
8726     if (V1DstIndex >= 0 || V2DstIndex >= 0)
8727       return false;
8728 
8729     if (Mask[i] < 4) {
8730       // V1 input out of place for insertion.
8731       V1DstIndex = i;
8732     } else {
8733       // V2 input for insertion.
8734       V2DstIndex = i;
8735     }
8736   }
8737 
8738   // Don't bother if we have no (non-zeroable) element for insertion.
8739   if (V1DstIndex < 0 && V2DstIndex < 0)
8740     return false;
8741 
8742   // Determine element insertion src/dst indices. The src index is from the
8743   // start of the inserted vector, not the start of the concatenated vector.
8744   unsigned V2SrcIndex = 0;
8745   if (V1DstIndex >= 0) {
8746     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8747     // and don't use the original V2 at all.
8748     V2SrcIndex = Mask[V1DstIndex];
8749     V2DstIndex = V1DstIndex;
8750     V2 = V1;
8751   } else {
8752     V2SrcIndex = Mask[V2DstIndex] - 4;
8753   }
8754 
8755   // If no V1 inputs are used in place, then the result is created only from
8756   // the zero mask and the V2 insertion - so remove V1 dependency.
8757   if (!V1UsedInPlace)
8758     V1 = DAG.getUNDEF(MVT::v4f32);
8759 
8760   // Insert the V2 element into the desired position.
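  // The INSERTPS immediate is laid out as [src:2][dst:2][zmask:4]; e.g.
  // src = 2, dst = 1 and a zeroed lane 3 encode as 0x98.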
8761   InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8762   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8763   return true;
8764 }
8765 
8766 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
8767                                             SDValue V2, ArrayRef<int> Mask,
8768                                             SelectionDAG &DAG) {
8769   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8770   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8771   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8772 
8773   // Attempt to match the insertps pattern.
8774   unsigned InsertPSMask;
8775   if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
8776     return SDValue();
8777 
8778   // Insert the V2 element into the desired position.
8779   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8780                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
8781 }
8782 
8783 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
8784 /// UNPCK instruction.
8785 ///
8786 /// This specifically targets cases where we end up with alternating between
8787 /// the two inputs, and so can permute them into something that feeds a single
8788 /// UNPCK instruction. Note that this routine only targets integer vectors
8789 /// because for floating point vectors we have a generalized SHUFPS lowering
8790 /// strategy that handles everything that doesn't *exactly* match an unpack,
8791 /// making this clever lowering unnecessary.
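///
/// For example, the v8i16 mask <0, 8, 2, 10, 4, 12, 6, 14> can be lowered by
/// permuting each input with <0, 2, 4, 6, u, u, u, u> and unpacking the two
/// results with UNPCKLWD.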
8792 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
8793                                                     SDValue V1, SDValue V2,
8794                                                     ArrayRef<int> Mask,
8795                                                     SelectionDAG &DAG) {
8796   assert(!VT.isFloatingPoint() &&
8797          "This routine only supports integer vectors.");
8798   assert(VT.is128BitVector() &&
8799          "This routine only works on 128-bit vectors.");
8800   assert(!V2.isUndef() &&
8801          "This routine should only be used when blending two inputs.");
8802   assert(Mask.size() >= 2 && "Single element masks are invalid.");
8803 
8804   int Size = Mask.size();
8805 
8806   int NumLoInputs =
8807       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
8808   int NumHiInputs =
8809       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
8810 
8811   bool UnpackLo = NumLoInputs >= NumHiInputs;
8812 
8813   auto TryUnpack = [&](MVT UnpackVT, int Scale) {
8814     SmallVector<int, 16> V1Mask(Mask.size(), -1);
8815     SmallVector<int, 16> V2Mask(Mask.size(), -1);
8816 
8817     for (int i = 0; i < Size; ++i) {
8818       if (Mask[i] < 0)
8819         continue;
8820 
8821       // Each element of the unpack contains Scale elements from this mask.
8822       int UnpackIdx = i / Scale;
8823 
8824       // We only handle the case where V1 feeds the first slots of the unpack.
8825       // We rely on canonicalization to ensure this is the case.
8826       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
8827         return SDValue();
8828 
8829       // Set up the mask for this input. The indexing is tricky as we have to
8830       // handle the unpack stride.
8831       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
8832       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
8833           Mask[i] % Size;
8834     }
8835 
8836     // If we will have to shuffle both inputs to use the unpack, check whether
8837     // we can just unpack first and shuffle the result. If so, skip this unpack.
8838     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
8839         !isNoopShuffleMask(V2Mask))
8840       return SDValue();
8841 
8842     // Shuffle the inputs into place.
8843     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8844     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8845 
8846     // Cast the inputs to the type we will use to unpack them.
8847     V1 = DAG.getBitcast(UnpackVT, V1);
8848     V2 = DAG.getBitcast(UnpackVT, V2);
8849 
8850     // Unpack the inputs and cast the result back to the desired type.
8851     return DAG.getBitcast(
8852         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
8853                         UnpackVT, V1, V2));
8854   };
8855 
8856   // We try each unpack from the largest to the smallest to try and find one
8857   // that fits this mask.
8858   int OrigNumElements = VT.getVectorNumElements();
8859   int OrigScalarSize = VT.getScalarSizeInBits();
8860   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
8861     int Scale = ScalarSize / OrigScalarSize;
8862     int NumElements = OrigNumElements / Scale;
8863     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
8864     if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
8865       return Unpack;
8866   }
8867 
8868   // If none of the unpack-rooted lowerings worked (or were profitable) try an
8869   // initial unpack.
8870   if (NumLoInputs == 0 || NumHiInputs == 0) {
8871     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
8872            "We have to have *some* inputs!");
8873     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
8874 
8875     // FIXME: We could consider the total complexity of the permute of each
8876     // possible unpacking. Or at the least we should consider how many
8877     // half-crossings are created.
8878     // FIXME: We could consider commuting the unpacks.
8879 
8880     SmallVector<int, 32> PermMask((unsigned)Size, -1);
8881     for (int i = 0; i < Size; ++i) {
8882       if (Mask[i] < 0)
8883         continue;
8884 
8885       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
8886 
8887       PermMask[i] =
8888           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
8889     }
8890     return DAG.getVectorShuffle(
8891         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
8892                             DL, VT, V1, V2),
8893         DAG.getUNDEF(VT), PermMask);
8894   }
8895 
8896   return SDValue();
8897 }
8898 
8899 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8900 ///
8901 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8902 /// support for floating point shuffles but not integer shuffles. These
8903 /// instructions will incur a domain crossing penalty on some chips though so
8904 /// it is better to avoid lowering through this for integer vectors where
8905 /// possible.
8906 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8907                                        SDValue V1, SDValue V2,
8908                                        const X86Subtarget &Subtarget,
8909                                        SelectionDAG &DAG) {
8910   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8911   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8912   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8913 
8914   if (V2.isUndef()) {
8915     // Check for being able to broadcast a single element.
8916     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8917             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8918       return Broadcast;
8919 
8920     // Straight shuffle of a single input vector. Simulate this by using the
8921     // single input as both of the "inputs" to this instruction.
8922     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
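    // (e.g. the splat mask <1, 1> selects the high element for both lanes and
    // encodes as immediate 0b11.)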
8923 
8924     if (Subtarget.hasAVX()) {
8925       // If we have AVX, we can use VPERMILPS which will allow folding a load
8926       // into the shuffle.
8927       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8928                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8929     }
8930 
8931     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
8932                        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8933   }
8934   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8935   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8936 
8937   // If we have a single input, insert that into V1 if we can do so cheaply.
8938   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8939     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8940             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8941       return Insertion;
8942     // Try inverting the insertion since for v2 masks it is easy to do and we
8943     // can't reliably sort the mask one way or the other.
8944     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8945                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8946     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8947             DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
8948       return Insertion;
8949   }
8950 
8951   // Try to use one of the special instruction patterns to handle two common
8952   // blend patterns if a zero-blend above didn't work.
8953   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
8954       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
8955     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8956       // We can either use a special instruction to load over the low double or
8957       // to move just the low double.
8958       return DAG.getNode(
8959           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8960           DL, MVT::v2f64, V2,
8961           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8962 
8963   if (Subtarget.hasSSE41())
8964     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8965                                                   Subtarget, DAG))
8966       return Blend;
8967 
8968   // Use dedicated unpack instructions for masks that match their pattern.
8969   if (SDValue V =
8970           lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
8971     return V;
8972 
8973   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8974   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
8975                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8976 }
8977 
8978 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8979 ///
8980 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8981 /// the integer unit to minimize domain crossing penalties. However, for blends
8982 /// it falls back to the floating point shuffle operation with appropriate bit
8983 /// casting.
8984 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8985                                        SDValue V1, SDValue V2,
8986                                        const X86Subtarget &Subtarget,
8987                                        SelectionDAG &DAG) {
8988   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8989   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8990   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8991 
8992   if (V2.isUndef()) {
8993     // Check for being able to broadcast a single element.
8994     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8995             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8996       return Broadcast;
8997 
8998     // Straight shuffle of a single input vector. For everything from SSE2
8999     // onward this has a single fast instruction with no scary immediates.
9000     // We have to map the mask as it is actually a v4i32 shuffle instruction.
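    // (e.g. the v2i64 mask <1, 0> widens to the v4i32 mask <2, 3, 0, 1>.)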
9001     V1 = DAG.getBitcast(MVT::v4i32, V1);
9002     int WidenedMask[4] = {
9003         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9004         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9005     return DAG.getBitcast(
9006         MVT::v2i64,
9007         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9008                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9009   }
9010   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9011   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9012   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9013   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9014 
9015   // If we have a blend of two same-type PACKUS operations and the blend aligns
9016   // with the low and high halves, we can just merge the PACKUS operations.
9017   // This is particularly important as it lets us merge shuffles that this
9018   // routine itself creates.
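  // (e.g. with Mask = <0, 3> the merged PACKUS takes V1Pack's first operand
  // and V2Pack's second operand.)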
9019   auto GetPackNode = [](SDValue V) {
9020     V = peekThroughBitcasts(V);
9021     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9022   };
9023   if (SDValue V1Pack = GetPackNode(V1))
9024     if (SDValue V2Pack = GetPackNode(V2)) {
9025       EVT PackVT = V1Pack.getValueType();
9026       if (PackVT == V2Pack.getValueType())
9027         return DAG.getBitcast(MVT::v2i64,
9028                               DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9029                                           Mask[0] == 0 ? V1Pack.getOperand(0)
9030                                                        : V1Pack.getOperand(1),
9031                                           Mask[1] == 2 ? V2Pack.getOperand(0)
9032                                                        : V2Pack.getOperand(1)));
9033     }
9034 
9035   // Try to use shift instructions.
9036   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9037                                                 Subtarget, DAG))
9038     return Shift;
9039 
9040   // When loading a scalar and then shuffling it into a vector we can often do
9041   // the insertion cheaply.
9042   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9043           DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9044     return Insertion;
9045   // Try inverting the insertion since for v2 masks it is easy to do and we
9046   // can't reliably sort the mask one way or the other.
9047   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9048   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9049           DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
9050     return Insertion;
9051 
9052   // We have different paths for blend lowering, but they all must use the
9053   // *exact* same predicate.
9054   bool IsBlendSupported = Subtarget.hasSSE41();
9055   if (IsBlendSupported)
9056     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9057                                                   Subtarget, DAG))
9058       return Blend;
9059 
9060   // Use dedicated unpack instructions for masks that match their pattern.
9061   if (SDValue V =
9062           lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9063     return V;
9064 
9065   // Try to use byte rotation instructions.
9066   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9067   if (Subtarget.hasSSSE3())
9068     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9069             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9070       return Rotate;
9071 
9072   // If we have direct support for blends, we should lower by decomposing into
9073   // a permute. That will be faster than the domain cross.
9074   if (IsBlendSupported)
9075     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9076                                                       Mask, DAG);
9077 
9078   // We implement this with SHUFPD which is pretty lame because it will likely
9079   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9080   // However, all the alternatives are still more cycles and newer chips don't
9081   // have this problem. It would be really nice if x86 had better shuffles here.
9082   V1 = DAG.getBitcast(MVT::v2f64, V1);
9083   V2 = DAG.getBitcast(MVT::v2f64, V2);
9084   return DAG.getBitcast(MVT::v2i64,
9085                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9086 }
9087 
9088 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
9089 ///
9090 /// This is used to disable more specialized lowerings when the shufps lowering
9091 /// will happen to be efficient.
9092 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9093   // This routine only handles 128-bit shufps.
9094   assert(Mask.size() == 4 && "Unsupported mask size!");
9095   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9096   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9097   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9098   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9099 
9100   // To lower with a single SHUFPS we need to have the low half and high half
9101   // each requiring a single input.
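  // For example, <0, 1, 6, 7> qualifies (low half from V1, high half from V2),
  // while <0, 4, 1, 5> does not.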
9102   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
9103     return false;
9104   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9105     return false;
9106 
9107   return true;
9108 }
9109 
9110 /// \brief Lower a vector shuffle using the SHUFPS instruction.
9111 ///
9112 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
9113 /// It makes no assumptions about whether this is the *best* lowering; it simply
9114 /// uses it.
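///
/// Recall that SHUFPS selects its two low result elements from the first
/// operand and its two high result elements from the second operand, each via
/// a 2-bit field of the immediate.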
9115 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9116                                             ArrayRef<int> Mask, SDValue V1,
9117                                             SDValue V2, SelectionDAG &DAG) {
9118   SDValue LowV = V1, HighV = V2;
9119   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9120 
9121   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9122 
9123   if (NumV2Elements == 1) {
9124     int V2Index =
9125         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
9126         Mask.begin();
9127 
9128     // Compute the index adjacent to V2Index and in the same half by toggling
9129     // the low bit.
9130     int V2AdjIndex = V2Index ^ 1;
9131 
9132     if (Mask[V2AdjIndex] < 0) {
9133       // Handles all the cases where we have a single V2 element and an undef.
9134       // This will only ever happen in the high lanes because we commute the
9135       // vector otherwise.
9136       if (V2Index < 2)
9137         std::swap(LowV, HighV);
9138       NewMask[V2Index] -= 4;
9139     } else {
9140       // Handle the case where the V2 element ends up adjacent to a V1 element.
9141       // To make this work, blend them together as the first step.
9142       int V1Index = V2AdjIndex;
9143       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9144       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9145                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9146 
9147       // Now proceed to reconstruct the final blend as we have the necessary
9148       // high or low half formed.
9149       if (V2Index < 2) {
9150         LowV = V2;
9151         HighV = V1;
9152       } else {
9153         HighV = V2;
9154       }
9155       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9156       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9157     }
9158   } else if (NumV2Elements == 2) {
9159     if (Mask[0] < 4 && Mask[1] < 4) {
9160       // Handle the easy case where we have V1 in the low lanes and V2 in the
9161       // high lanes.
9162       NewMask[2] -= 4;
9163       NewMask[3] -= 4;
9164     } else if (Mask[2] < 4 && Mask[3] < 4) {
9165       // We also handle the reversed case because this utility may get called
9166       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9167       // arrange things in the right direction.
9168       NewMask[0] -= 4;
9169       NewMask[1] -= 4;
9170       HighV = V1;
9171       LowV = V2;
9172     } else {
9173       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
9174       // trying to place elements directly, just blend them and set up the final
9175       // shuffle to place them.
9176 
9177       // The first two blend mask elements are for V1, the second two are for
9178       // V2.
9179       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
9180                           Mask[2] < 4 ? Mask[2] : Mask[3],
9181                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
9182                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
9183       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
9184                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9185 
9186       // Now we do a normal shuffle of V1 by giving V1 as both operands to
9187       // a blend.
9188       LowV = HighV = V1;
9189       NewMask[0] = Mask[0] < 4 ? 0 : 2;
9190       NewMask[1] = Mask[0] < 4 ? 2 : 0;
9191       NewMask[2] = Mask[2] < 4 ? 1 : 3;
9192       NewMask[3] = Mask[2] < 4 ? 3 : 1;
9193     }
9194   }
9195   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
9196                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
9197 }
9198 
9199 /// \brief Lower 4-lane 32-bit floating point shuffles.
9200 ///
9201 /// Uses instructions exclusively from the floating point unit to minimize
9202 /// domain crossing penalties, as these are sufficient to implement all v4f32
9203 /// shuffles.
9204 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9205                                        SDValue V1, SDValue V2,
9206                                        const X86Subtarget &Subtarget,
9207                                        SelectionDAG &DAG) {
9208   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9209   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9210   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9211 
9212   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9213 
9214   if (NumV2Elements == 0) {
9215     // Check for being able to broadcast a single element.
9216     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9217             DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
9218       return Broadcast;
9219 
9220     // Use even/odd duplicate instructions for masks that match their pattern.
9221     if (Subtarget.hasSSE3()) {
9222       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
9223         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
9224       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
9225         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
9226     }
9227 
9228     if (Subtarget.hasAVX()) {
9229       // If we have AVX, we can use VPERMILPS which will allow folding a load
9230       // into the shuffle.
9231       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
9232                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9233     }
9234 
9235     // Otherwise, use a straight shuffle of a single input vector. We pass the
9236     // input vector to both operands to simulate this with a SHUFPS.
9237     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
9238                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9239   }
9240 
9241   // There are special ways we can lower some single-element blends. However, we
9242   // have custom ways we can lower more complex single-element blends below that
9243   // we defer to if both this and BLENDPS fail to match, so restrict this to
9244   // when the V2 input is targeting element 0 of the mask -- that is the fast
9245   // case here.
9246   if (NumV2Elements == 1 && Mask[0] >= 4)
9247     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
9248                                                          Mask, Subtarget, DAG))
9249       return V;
9250 
9251   if (Subtarget.hasSSE41()) {
9252     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
9253                                                   Subtarget, DAG))
9254       return Blend;
9255 
9256     // Use INSERTPS if we can complete the shuffle efficiently.
9257     if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
9258       return V;
9259 
9260     if (!isSingleSHUFPSMask(Mask))
9261       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
9262               DL, MVT::v4f32, V1, V2, Mask, DAG))
9263         return BlendPerm;
9264   }
9265 
9266   // Use low/high mov instructions.
9267   if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
9268     return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
9269   if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
9270     return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
9271 
9272   // Use dedicated unpack instructions for masks that match their pattern.
9273   if (SDValue V =
9274           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
9275     return V;
9276 
9277   // Otherwise fall back to a SHUFPS lowering strategy.
9278   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
9279 }
9280 
9281 /// \brief Lower 4-lane i32 vector shuffles.
9282 ///
9283 /// We try to handle these with integer-domain shuffles where we can, but for
9284 /// blends we use the floating point domain blend instructions.
9285 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9286                                        SDValue V1, SDValue V2,
9287                                        const X86Subtarget &Subtarget,
9288                                        SelectionDAG &DAG) {
9289   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9290   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9291   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9292 
9293   // Whenever we can lower this as a zext, that instruction is strictly faster
9294   // than any alternative. It also allows us to fold memory operands into the
9295   // shuffle in many cases.
9296   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
9297                                                          Mask, Subtarget, DAG))
9298     return ZExt;
9299 
9300   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9301 
9302   if (NumV2Elements == 0) {
9303     // Check for being able to broadcast a single element.
9304     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9305             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9306       return Broadcast;
9307 
9308     // Straight shuffle of a single input vector. For everything from SSE2
9309     // onward this has a single fast instruction with no scary immediates.
9310     // We coerce the shuffle pattern to be compatible with UNPCK instructions
9311     // but we aren't actually going to use the UNPCK instruction because doing
9312     // so prevents folding a load into this instruction or making a copy.
9313     const int UnpackLoMask[] = {0, 0, 1, 1};
9314     const int UnpackHiMask[] = {2, 2, 3, 3};
9315     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
9316       Mask = UnpackLoMask;
9317     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
9318       Mask = UnpackHiMask;
9319 
9320     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9321                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9322   }
9323 
9324   // Try to use shift instructions.
9325   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
9326                                                 Subtarget, DAG))
9327     return Shift;
9328 
9329   // There are special ways we can lower some single-element blends.
9330   if (NumV2Elements == 1)
9331     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
9332                                                          Mask, Subtarget, DAG))
9333       return V;
9334 
9335   // We have different paths for blend lowering, but they all must use the
9336   // *exact* same predicate.
9337   bool IsBlendSupported = Subtarget.hasSSE41();
9338   if (IsBlendSupported)
9339     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
9340                                                   Subtarget, DAG))
9341       return Blend;
9342 
9343   if (SDValue Masked =
9344           lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
9345     return Masked;
9346 
9347   // Use dedicated unpack instructions for masks that match their pattern.
9348   if (SDValue V =
9349           lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
9350     return V;
9351 
9352   // Try to use byte rotation instructions.
9353   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9354   if (Subtarget.hasSSSE3())
9355     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9356             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9357       return Rotate;
9358 
9359   // If we have direct support for blends, we should lower by decomposing into
9360   // a permute. That will be faster than the domain cross.
9361   if (IsBlendSupported)
9362     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
9363                                                       Mask, DAG);
9364 
9365   // Try to lower by permuting the inputs into an unpack instruction.
9366   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
9367                                                             V2, Mask, DAG))
9368     return Unpack;
9369 
9370   // We implement this with SHUFPS because it can blend from two vectors.
9371   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
9372   // up the inputs, bypassing domain shift penalties that we would incur if we
9373   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
9374   // relevant.
9375   return DAG.getBitcast(
9376       MVT::v4i32,
9377       DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
9378                            DAG.getBitcast(MVT::v4f32, V2), Mask));
9379 }
9380 
9381 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
9382 /// shuffle lowering, and the most complex part.
9383 ///
9384 /// The lowering strategy is to try to form pairs of input lanes which are
9385 /// targeted at the same half of the final vector, and then use a dword shuffle
9386 /// to place them onto the right half, and finally unpack the paired lanes into
9387 /// their final position.
9388 ///
9389 /// The exact breakdown of how to form these dword pairs and align them on the
9390 /// correct sides is really tricky. See the comments within the function for
9391 /// more of the details.
9392 ///
9393 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
9394 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
9395 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
9396 /// vector, form the analogous 128-bit 8-element Mask.
9397 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
9398     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
9399     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9400   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
9401   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
9402 
9403   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
9404   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
9405   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
9406 
9407   SmallVector<int, 4> LoInputs;
9408   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
9409                [](int M) { return M >= 0; });
9410   std::sort(LoInputs.begin(), LoInputs.end());
9411   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
9412   SmallVector<int, 4> HiInputs;
9413   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
9414                [](int M) { return M >= 0; });
9415   std::sort(HiInputs.begin(), HiInputs.end());
9416   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
9417   int NumLToL =
9418       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
9419   int NumHToL = LoInputs.size() - NumLToL;
9420   int NumLToH =
9421       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
9422   int NumHToH = HiInputs.size() - NumLToH;
9423   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
9424   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
9425   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
9426   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
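  // For example, the mask <0, 5, 1, 7, 4, 2, 6, 3> gives LoInputs = {0, 1, 5, 7}
  // and HiInputs = {2, 3, 4, 6}, so NumLToL = NumHToL = NumLToH = NumHToH = 2.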
9427 
9428   // If we are splatting two values from one half - one to each half, then
9429   // we can shuffle that half so each is splatted to a dword, then splat those
9430   // to their respective halves.
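  // For example, <0, 0, 0, 0, 2, 2, 2, 2> splats word 0 across the low half
  // and word 2 across the high half using PSHUFLW followed by PSHUFD.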
9431   auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
9432                         int DOffset) {
9433     int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
9434     int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
9435     V = DAG.getNode(ShufWOp, DL, VT, V,
9436                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9437     V = DAG.getBitcast(PSHUFDVT, V);
9438     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
9439                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9440     return DAG.getBitcast(VT, V);
9441   };
9442 
9443   if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
9444     return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
9445   if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
9446     return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
9447 
9448   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
9449   // such inputs we can swap two of the dwords across the half mark and end up
9450   // with <=2 inputs to each half in each half. Once there, we can fall through
9451   // to the generic code below. For example:
9452   //
9453   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9454   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
9455   //
9456   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
9457   // and an existing 2-into-2 on the other half. In this case we may have to
9458   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
9459   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
9460   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
9461   // because any other situation (including a 3-into-1 or 1-into-3 in the other
9462   // half than the one we target for fixing) will be fixed when we re-enter this
9463   // path. We will also combine away any sequence of PSHUFD instructions that
9464   // result into a single instruction. Here is an example of the tricky case:
9465   //
9466   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9467   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
9468   //
9469   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
9470   //
9471   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
9472   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
9473   //
9474   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
9475   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
9476   //
9477   // The result is fine to be handled by the generic logic.
9478   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
9479                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
9480                           int AOffset, int BOffset) {
9481     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
9482            "Must call this with A having 3 or 1 inputs from the A half.");
9483     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
9484            "Must call this with B having 1 or 3 inputs from the B half.");
9485     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
9486            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
9487 
9488     bool ThreeAInputs = AToAInputs.size() == 3;
9489 
9490     // Compute the index of dword with only one word among the three inputs in
9491     // a half by taking the sum of the half with three inputs and subtracting
9492     // the sum of the actual three inputs. The difference is the remaining
9493     // slot.
9494     int ADWord, BDWord;
9495     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
9496     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
9497     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
9498     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
9499     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
9500     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
9501     int TripleNonInputIdx =
9502         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
9503     TripleDWord = TripleNonInputIdx / 2;
9504 
9505     // We use xor with one to compute the adjacent DWord to whichever one the
9506     // OneInput is in.
9507     OneInputDWord = (OneInput / 2) ^ 1;
9508 
9509     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
9510     // and BToA inputs. If there is also such a problem with the BToB and AToB
9511     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
9512     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
9513     // is essential that we don't *create* a 3<-1 as then we might oscillate.
9514     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
9515       // Compute how many inputs will be flipped by swapping these DWords. We
9516       // need to balance this to ensure we don't form a 3-1 shuffle in the
9517       // other half.
9519       int NumFlippedAToBInputs =
9520           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
9521           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
9522       int NumFlippedBToBInputs =
9523           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
9524           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
9525       if ((NumFlippedAToBInputs == 1 &&
9526            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
9527           (NumFlippedBToBInputs == 1 &&
9528            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
9529         // We choose whether to fix the A half or B half based on whether that
9530         // half has zero flipped inputs. At zero, we may not be able to fix it
9531         // with that half. We also bias towards fixing the B half because that
9532         // will more commonly be the high half, and we have to bias one way.
9533         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
9534                                                        ArrayRef<int> Inputs) {
9535           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
9536           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
9537                                          PinnedIdx ^ 1) != Inputs.end();
9538           // Determine whether the free index is in the flipped dword or the
9539           // unflipped dword based on where the pinned index is. We use this bit
9540           // in an xor to conditionally select the adjacent dword.
9541           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
9542           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9543                                              FixFreeIdx) != Inputs.end();
9544           if (IsFixIdxInput == IsFixFreeIdxInput)
9545             FixFreeIdx += 1;
9546           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9547                                         FixFreeIdx) != Inputs.end();
9548           assert(IsFixIdxInput != IsFixFreeIdxInput &&
9549                  "We need to be changing the number of flipped inputs!");
9550           int PSHUFHalfMask[] = {0, 1, 2, 3};
9551           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
9552           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
9553                           MVT::v8i16, V,
9554                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9555 
9556           for (int &M : Mask)
9557             if (M >= 0 && M == FixIdx)
9558               M = FixFreeIdx;
9559             else if (M >= 0 && M == FixFreeIdx)
9560               M = FixIdx;
9561         };
9562         if (NumFlippedBToBInputs != 0) {
9563           int BPinnedIdx =
9564               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9565           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9566         } else {
9567           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9568           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
9569           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9570         }
9571       }
9572     }
9573 
9574     int PSHUFDMask[] = {0, 1, 2, 3};
9575     PSHUFDMask[ADWord] = BDWord;
9576     PSHUFDMask[BDWord] = ADWord;
9577     V = DAG.getBitcast(
9578         VT,
9579         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9580                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9581 
9582     // Adjust the mask to match the new locations of A and B.
9583     for (int &M : Mask)
9584       if (M >= 0 && M/2 == ADWord)
9585         M = 2 * BDWord + M % 2;
9586       else if (M >= 0 && M/2 == BDWord)
9587         M = 2 * ADWord + M % 2;
9588 
9589     // Recurse back into this routine to re-compute state now that this isn't
9590     // a 3 and 1 problem.
9591     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
9592                                                      DAG);
9593   };
9594   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9595     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9596   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9597     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9598 
9599   // At this point there are at most two inputs to the low and high halves from
9600   // each half. That means the inputs can always be grouped into dwords and
9601   // those dwords can then be moved to the correct half with a dword shuffle.
9602   // We use at most one low and one high word shuffle to collect these paired
9603   // inputs into dwords, and finally a dword shuffle to place them.
9604   int PSHUFLMask[4] = {-1, -1, -1, -1};
9605   int PSHUFHMask[4] = {-1, -1, -1, -1};
9606   int PSHUFDMask[4] = {-1, -1, -1, -1};
9607 
9608   // First fix the masks for all the inputs that are staying in their
9609   // original halves. This will then dictate the targets of the cross-half
9610   // shuffles.
9611   auto fixInPlaceInputs =
9612       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9613                     MutableArrayRef<int> SourceHalfMask,
9614                     MutableArrayRef<int> HalfMask, int HalfOffset) {
9615     if (InPlaceInputs.empty())
9616       return;
9617     if (InPlaceInputs.size() == 1) {
9618       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9619           InPlaceInputs[0] - HalfOffset;
9620       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9621       return;
9622     }
9623     if (IncomingInputs.empty()) {
9624       // Just fix all of the in place inputs.
9625       for (int Input : InPlaceInputs) {
9626         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9627         PSHUFDMask[Input / 2] = Input / 2;
9628       }
9629       return;
9630     }
9631 
9632     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9633     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9634         InPlaceInputs[0] - HalfOffset;
9635     // Put the second input next to the first so that they are packed into
9636     // a dword. We find the adjacent index by toggling the low bit.
9637     int AdjIndex = InPlaceInputs[0] ^ 1;
9638     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9639     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9640     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9641   };
9642   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9643   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9644 
9645   // Now gather the cross-half inputs and place them into a free dword of
9646   // their target half.
9647   // FIXME: This operation could almost certainly be simplified dramatically to
9648   // look more like the 3-1 fixing operation.
9649   auto moveInputsToRightHalf = [&PSHUFDMask](
9650       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9651       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9652       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9653       int DestOffset) {
9654     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9655       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
9656     };
9657     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9658                                                int Word) {
9659       int LowWord = Word & ~1;
9660       int HighWord = Word | 1;
9661       return isWordClobbered(SourceHalfMask, LowWord) ||
9662              isWordClobbered(SourceHalfMask, HighWord);
9663     };
9664 
9665     if (IncomingInputs.empty())
9666       return;
9667 
9668     if (ExistingInputs.empty()) {
9669       // Map any dwords with inputs from them into the right half.
9670       for (int Input : IncomingInputs) {
9671         // If the source half mask maps over the inputs, turn those into
9672         // swaps and use the swapped lane.
9673         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9674           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
9675             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9676                 Input - SourceOffset;
9677             // We have to swap the uses in our half mask in one sweep.
9678             for (int &M : HalfMask)
9679               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9680                 M = Input;
9681               else if (M == Input)
9682                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9683           } else {
9684             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9685                        Input - SourceOffset &&
9686                    "Previous placement doesn't match!");
9687           }
9688           // Note that this correctly re-maps both when we do a swap and when
9689           // we observe the other side of the swap above. We rely on that to
9690           // avoid swapping the members of the input list directly.
9691           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9692         }
9693 
9694         // Map the input's dword into the correct half.
9695         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
9696           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9697         else
9698           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9699                      Input / 2 &&
9700                  "Previous placement doesn't match!");
9701       }
9702 
9703       // And just directly shift any other-half mask elements to be same-half
9704       // as we will have mirrored the dword containing the element into the
9705       // same position within that half.
9706       for (int &M : HalfMask)
9707         if (M >= SourceOffset && M < SourceOffset + 4) {
9708           M = M - SourceOffset + DestOffset;
9709           assert(M >= 0 && "This should never wrap below zero!");
9710         }
9711       return;
9712     }
9713 
9714     // Ensure we have the input in a viable dword of its current half. This
9715     // is particularly tricky because the original position may be clobbered
9716     // by inputs being moved and *staying* in that half.
9717     if (IncomingInputs.size() == 1) {
9718       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9719         int InputFixed = std::find(std::begin(SourceHalfMask),
9720                                    std::end(SourceHalfMask), -1) -
9721                          std::begin(SourceHalfMask) + SourceOffset;
9722         SourceHalfMask[InputFixed - SourceOffset] =
9723             IncomingInputs[0] - SourceOffset;
9724         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9725                      InputFixed);
9726         IncomingInputs[0] = InputFixed;
9727       }
9728     } else if (IncomingInputs.size() == 2) {
9729       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9730           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9731         // We have two non-adjacent or clobbered inputs we need to extract from
9732         // the source half. To do this, we need to map them into some adjacent
9733         // dword slot in the source mask.
9734         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9735                               IncomingInputs[1] - SourceOffset};
9736 
9737         // If there is a free slot in the source half mask adjacent to one of
9738         // the inputs, place the other input in it. We use (Index XOR 1) to
9739         // compute an adjacent index.
9740         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9741             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
9742           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9743           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9744           InputsFixed[1] = InputsFixed[0] ^ 1;
9745         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9746                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
9747           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9748           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9749           InputsFixed[0] = InputsFixed[1] ^ 1;
9750         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
9751                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
9752           // The two inputs are in the same DWord but it is clobbered and the
9753           // adjacent DWord isn't used at all. Move both inputs to the free
9754           // slot.
9755           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9756           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9757           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9758           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9759         } else {
9760           // The only way we hit this point is if there is no clobbering
9761           // (because there are no off-half inputs to this half) and there is no
9762           // free slot adjacent to one of the inputs. In this case, we have to
9763           // swap an input with a non-input.
9764           for (int i = 0; i < 4; ++i)
9765             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
9766                    "We can't handle any clobbers here!");
9767           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9768                  "Cannot have adjacent inputs here!");
9769 
9770           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9771           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9772 
9773           // We also have to update the final source mask in this case because
9774           // it may need to undo the above swap.
9775           for (int &M : FinalSourceHalfMask)
9776             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9777               M = InputsFixed[1] + SourceOffset;
9778             else if (M == InputsFixed[1] + SourceOffset)
9779               M = (InputsFixed[0] ^ 1) + SourceOffset;
9780 
9781           InputsFixed[1] = InputsFixed[0] ^ 1;
9782         }
9783 
9784         // Point everything at the fixed inputs.
9785         for (int &M : HalfMask)
9786           if (M == IncomingInputs[0])
9787             M = InputsFixed[0] + SourceOffset;
9788           else if (M == IncomingInputs[1])
9789             M = InputsFixed[1] + SourceOffset;
9790 
9791         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9792         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9793       }
9794     } else {
9795       llvm_unreachable("Unhandled input size!");
9796     }
9797 
9798     // Now hoist the DWord down to the right half.
9799     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
9800     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
9801     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9802     for (int &M : HalfMask)
9803       for (int Input : IncomingInputs)
9804         if (M == Input)
9805           M = FreeDWord * 2 + Input % 2;
9806   };
9807   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9808                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9809   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9810                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9811 
9812   // Now enact all the shuffles we've computed to move the inputs into their
9813   // target half.
9814   if (!isNoopShuffleMask(PSHUFLMask))
9815     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9816                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
9817   if (!isNoopShuffleMask(PSHUFHMask))
9818     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9819                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
9820   if (!isNoopShuffleMask(PSHUFDMask))
9821     V = DAG.getBitcast(
9822         VT,
9823         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9824                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9825 
9826   // At this point, each half should contain all its inputs, and we can then
9827   // just shuffle them into their final position.
9828   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
9829          "Failed to lift all the high half inputs to the low mask!");
9830   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
9831          "Failed to lift all the low half inputs to the high mask!");
9832 
9833   // Do a half shuffle for the low mask.
9834   if (!isNoopShuffleMask(LoMask))
9835     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9836                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
9837 
9838   // Do a half shuffle with the high mask after shifting its values down.
9839   for (int &M : HiMask)
9840     if (M >= 0)
9841       M -= 4;
9842   if (!isNoopShuffleMask(HiMask))
9843     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9844                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
9845 
9846   return V;
9847 }
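
// Worked example (illustration only, derived from the routine above): the
// single-input v8i16 mask <0,1,2,3,0,1,2,3> sources all four high-half outputs
// from the low half. fixInPlaceInputs leaves the low half alone,
// moveInputsToRightHalf copies the two source dwords into the free high
// dwords, and the whole shuffle collapses to one PSHUFD with dword mask
// {0,1,0,1}; the PSHUFLW/PSHUFHW steps become no-ops. By contrast, the mask
// <0,1,2,3,6,7,4,5> needs no dword moves at all and lowers to a single
// PSHUFHW with half mask {2,3,0,1}.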
9848 
9849 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
9850 /// blend if only one input is used.
9851 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
9852     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9853     SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
9854   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9855   SDValue V1Mask[16];
9856   SDValue V2Mask[16];
9857   V1InUse = false;
9858   V2InUse = false;
9859 
9860   int Size = Mask.size();
9861   int Scale = 16 / Size;
9862   for (int i = 0; i < 16; ++i) {
9863     if (Mask[i / Scale] < 0) {
9864       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9865     } else {
9866       const int ZeroMask = 0x80;
9867       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
9868                                           : ZeroMask;
9869       int V2Idx = Mask[i / Scale] < Size
9870                       ? ZeroMask
9871                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
9872       if (Zeroable[i / Scale])
9873         V1Idx = V2Idx = ZeroMask;
9874       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
9875       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
9876       V1InUse |= (ZeroMask != V1Idx);
9877       V2InUse |= (ZeroMask != V2Idx);
9878     }
9879   }
9880 
9881   if (V1InUse)
9882     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9883                      DAG.getBitcast(MVT::v16i8, V1),
9884                      DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
9885   if (V2InUse)
9886     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9887                      DAG.getBitcast(MVT::v16i8, V2),
9888                      DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
9889 
9890   // If we need shuffled inputs from both, blend the two.
9891   SDValue V;
9892   if (V1InUse && V2InUse)
9893     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9894   else
9895     V = V1InUse ? V1 : V2;
9896 
9897   // Cast the result back to the correct type.
9898   return DAG.getBitcast(VT, V);
9899 }
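
// Illustrative sketch only (not called by the lowering above): mirrors how the
// routine derives the two PSHUFB control vectors from a shuffle mask, using
// plain ints so the index arithmetic is easy to follow. The 0x80 "zero this
// byte" convention is PSHUFB's real semantics; the zeroable-element handling
// is omitted and undef lanes are simply zeroed here instead of being left
// undef. The function name and signature are expository only, and
// LLVM_ATTRIBUTE_UNUSED is assumed to be available through the existing
// Support includes.
LLVM_ATTRIBUTE_UNUSED static void
examplePSHUFBControlBytes(ArrayRef<int> Mask, int (&V1Ctl)[16],
                          int (&V2Ctl)[16]) {
  int Size = Mask.size(); // 8 for v8i16, 16 for v16i8.
  int Scale = 16 / Size;  // Bytes covered by each mask element.
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    if (M < 0) {
      V1Ctl[i] = V2Ctl[i] = 0x80;
      continue;
    }
    // Exactly one control selects a real byte; the other zeroes its byte so
    // the two PSHUFB results can simply be ORed together.
    V1Ctl[i] = M < Size ? M * Scale + i % Scale : 0x80;
    V2Ctl[i] = M < Size ? 0x80 : (M - Size) * Scale + i % Scale;
  }
}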
9900 
9901 /// \brief Generic lowering of 8-lane i16 shuffles.
9902 ///
9903 /// This handles both single-input shuffles and combined shuffle/blends with
9904 /// two inputs. The single input shuffles are immediately delegated to
9905 /// a dedicated lowering routine.
9906 ///
9907 /// The blends are lowered in one of three fundamental ways. If there are few
9908 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9909 /// can be lowered significantly more cheaply as an interleaving of the two
9910 /// inputs, it tries to interleave them. Otherwise, it blends the low and high
9911 /// halves of the inputs separately (so that each has relatively few inputs)
9912 /// and then concatenates the results.
9913 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9914                                        SDValue V1, SDValue V2,
9915                                        const X86Subtarget &Subtarget,
9916                                        SelectionDAG &DAG) {
9917   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9918   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9919   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9920 
9921   // Whenever we can lower this as a zext, that instruction is strictly faster
9922   // than any alternative.
9923   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9924           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9925     return ZExt;
9926 
9927   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
9928 
9929   if (NumV2Inputs == 0) {
9930     // Check for being able to broadcast a single element.
9931     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9932             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9933       return Broadcast;
9934 
9935     // Try to use shift instructions.
9936     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
9937                                                   Subtarget, DAG))
9938       return Shift;
9939 
9940     // Use dedicated unpack instructions for masks that match their pattern.
9941     if (SDValue V =
9942             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9943       return V;
9944 
9945     // Try to use byte rotation instructions.
9946     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
9947                                                         Mask, Subtarget, DAG))
9948       return Rotate;
9949 
9950     // Make a copy of the mask so it can be modified.
9951     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
9952     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
9953                                                      MutableMask, Subtarget,
9954                                                      DAG);
9955   }
9956 
9957   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
9958          "All single-input shuffles should be canonicalized to be V1-input "
9959          "shuffles.");
9960 
9961   // Try to use shift instructions.
9962   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
9963                                                 Subtarget, DAG))
9964     return Shift;
9965 
9966   // See if we can use SSE4A Extraction / Insertion.
9967   if (Subtarget.hasSSE4A())
9968     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
9969       return V;
9970 
9971   // There are special ways we can lower some single-element blends.
9972   if (NumV2Inputs == 1)
9973     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
9974                                                          Mask, Subtarget, DAG))
9975       return V;
9976 
9977   // We have different paths for blend lowering, but they all must use the
9978   // *exact* same predicate.
9979   bool IsBlendSupported = Subtarget.hasSSE41();
9980   if (IsBlendSupported)
9981     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9982                                                   Subtarget, DAG))
9983       return Blend;
9984 
9985   if (SDValue Masked =
9986           lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9987     return Masked;
9988 
9989   // Use dedicated unpack instructions for masks that match their pattern.
9990   if (SDValue V =
9991           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9992     return V;
9993 
9994   // Try to use byte rotation instructions.
9995   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9996           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9997     return Rotate;
9998 
9999   if (SDValue BitBlend =
10000           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10001     return BitBlend;
10002 
10003   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10004                                                             V2, Mask, DAG))
10005     return Unpack;
10006 
10007   // If we can't directly blend but can use PSHUFB, that will be better as it
10008   // can both shuffle and set up the inefficient blend.
10009   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10010     bool V1InUse, V2InUse;
10011     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
10012                                               V1InUse, V2InUse);
10013   }
10014 
10015   // We can always bit-blend if we have to, so the fallback strategy is to
10016   // decompose into single-input permutes and blends.
10017   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10018                                                       Mask, DAG);
10019 }
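
// For instance (illustration only), the two-input v8i16 mask
// <0,8,1,9,2,10,3,11> is not a per-element blend, so on a plain SSE2 target it
// falls through the earlier checks and is picked up by the UNPCK step above,
// lowering to a single PUNPCKLWD of V1 and V2.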
10020 
10021 /// \brief Check whether a compaction lowering can be done by dropping even
10022 /// elements and compute how many times even elements must be dropped.
10023 ///
10024 /// This handles shuffles which take every (2^N)th element, i.e. masks whose
10025 /// stride between selected elements is a power of two. Example shuffle masks:
10026 ///
10027 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
10028 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10029 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
10030 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
10031 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
10032 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
10033 ///
10034 /// Any of these lanes can of course be undef.
10035 ///
10036 /// This routine only supports N <= 3.
10037 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10038 /// for larger N.
10039 ///
10040 /// \returns N above, or the number of times even elements must be dropped if
10041 /// there is such a number. Otherwise returns zero.
10042 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10043                                           bool IsSingleInput) {
10044   // The modulus for the shuffle vector entries is based on whether this is
10045   // a single input or not.
10046   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10047   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10048          "We should only be called with masks with a power-of-2 size!");
10049 
10050   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10051 
10052   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10053   // and 2^3 simultaneously. This is because we may have ambiguity with
10054   // partially undef inputs.
10055   bool ViableForN[3] = {true, true, true};
10056 
10057   for (int i = 0, e = Mask.size(); i < e; ++i) {
10058     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10059     // want.
10060     if (Mask[i] < 0)
10061       continue;
10062 
10063     bool IsAnyViable = false;
10064     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10065       if (ViableForN[j]) {
10066         uint64_t N = j + 1;
10067 
10068         // The shuffle mask must be equal to (i * 2^N) % M.
10069         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10070           IsAnyViable = true;
10071         else
10072           ViableForN[j] = false;
10073       }
10074     // Early exit if we exhaust the possible powers of two.
10075     if (!IsAnyViable)
10076       break;
10077   }
10078 
10079   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10080     if (ViableForN[j])
10081       return j + 1;
10082 
10083   // Return 0 as there is no viable power of two.
10084   return 0;
10085 }
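
// For example (illustration only): with a single-input v16i8 mask
// <0,2,4,6,8,10,12,14,0,2,...>, ModMask is 15 and every defined entry equals
// (i << 1) & 15, so only N = 1 stays viable and the routine returns 1. The
// mask <0,4,8,12,0,4,...> instead matches (i << 2) & 15 and returns 2, while
// anything containing the pair <0,3,...> fails every stride and returns 0.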
10086 
10087 /// \brief Generic lowering of v16i8 shuffles.
10088 ///
10089 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
10090 /// detect any complexity reducing interleaving. If that doesn't help, it uses
10091 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
10092 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
10093 /// back together.
10094 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10095                                        SDValue V1, SDValue V2,
10096                                        const X86Subtarget &Subtarget,
10097                                        SelectionDAG &DAG) {
10098   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10099   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10100   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10101 
10102   // Try to use shift instructions.
10103   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
10104                                                 Subtarget, DAG))
10105     return Shift;
10106 
10107   // Try to use byte rotation instructions.
10108   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10109           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10110     return Rotate;
10111 
10112   // Try to use a zext lowering.
10113   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10114           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10115     return ZExt;
10116 
10117   // See if we can use SSE4A Extraction / Insertion.
10118   if (Subtarget.hasSSE4A())
10119     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
10120       return V;
10121 
10122   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
10123 
10124   // For single-input shuffles, there are some nicer lowering tricks we can use.
10125   if (NumV2Elements == 0) {
10126     // Check for being able to broadcast a single element.
10127     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10128             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10129       return Broadcast;
10130 
10131     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
10132     // Notably, this handles splat and partial-splat shuffles more efficiently.
10133     // However, it only makes sense if the pre-duplication shuffle simplifies
10134     // things significantly. Currently, this means we need to be able to
10135     // express the pre-duplication shuffle as an i16 shuffle.
10136     //
10137     // FIXME: We should check for other patterns which can be widened into an
10138     // i16 shuffle as well.
10139     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
10140       for (int i = 0; i < 16; i += 2)
10141         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
10142           return false;
10143 
10144       return true;
10145     };
10146     auto tryToWidenViaDuplication = [&]() -> SDValue {
10147       if (!canWidenViaDuplication(Mask))
10148         return SDValue();
10149       SmallVector<int, 4> LoInputs;
10150       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
10151                    [](int M) { return M >= 0 && M < 8; });
10152       std::sort(LoInputs.begin(), LoInputs.end());
10153       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
10154                      LoInputs.end());
10155       SmallVector<int, 4> HiInputs;
10156       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
10157                    [](int M) { return M >= 8; });
10158       std::sort(HiInputs.begin(), HiInputs.end());
10159       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
10160                      HiInputs.end());
10161 
10162       bool TargetLo = LoInputs.size() >= HiInputs.size();
10163       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
10164       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
10165 
10166       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
10167       SmallDenseMap<int, int, 8> LaneMap;
10168       for (int I : InPlaceInputs) {
10169         PreDupI16Shuffle[I/2] = I/2;
10170         LaneMap[I] = I;
10171       }
10172       int j = TargetLo ? 0 : 4, je = j + 4;
10173       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
10174         // Check if j is already a shuffle of this input. This happens when
10175         // there are two adjacent bytes after we move the low one.
10176         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
10177           // If we haven't yet mapped the input, search for a slot into which
10178           // we can map it.
10179           while (j < je && PreDupI16Shuffle[j] >= 0)
10180             ++j;
10181 
10182           if (j == je)
10183             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
10184             return SDValue();
10185 
10186           // Map this input with the i16 shuffle.
10187           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
10188         }
10189 
10190         // Update the lane map based on the mapping we ended up with.
10191         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
10192       }
10193       V1 = DAG.getBitcast(
10194           MVT::v16i8,
10195           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10196                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
10197 
10198       // Unpack the bytes to form the i16s that will be shuffled into place.
10199       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10200                        MVT::v16i8, V1, V1);
10201 
10202       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10203       for (int i = 0; i < 16; ++i)
10204         if (Mask[i] >= 0) {
10205           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
10206           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
10207           if (PostDupI16Shuffle[i / 2] < 0)
10208             PostDupI16Shuffle[i / 2] = MappedMask;
10209           else
10210             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
10211                    "Conflicting entries in the original shuffle!");
10212         }
10213       return DAG.getBitcast(
10214           MVT::v16i8,
10215           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10216                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
10217     };
10218     if (SDValue V = tryToWidenViaDuplication())
10219       return V;
10220   }
10221 
10222   if (SDValue Masked =
10223           lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
10224     return Masked;
10225 
10226   // Use dedicated unpack instructions for masks that match their pattern.
10227   if (SDValue V =
10228           lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
10229     return V;
10230 
10231   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
10232   // with PSHUFB. It is important to do this before we attempt to generate any
10233   // blends but after all of the single-input lowerings. If the single input
10234   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
10235   // want to preserve that and we can DAG combine any longer sequences into
10236   // a PSHUFB in the end. But once we start blending from multiple inputs,
10237   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
10238   // and there are *very* few patterns that would actually be faster than the
10239   // PSHUFB approach because of its ability to zero lanes.
10240   //
10241   // FIXME: The only exceptions to the above are blends which are exact
10242   // interleavings with direct instructions supporting them. We currently don't
10243   // handle those well here.
10244   if (Subtarget.hasSSSE3()) {
10245     bool V1InUse = false;
10246     bool V2InUse = false;
10247 
10248     SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
10249         DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
10250 
10251     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
10252     // do so. This avoids using them to handle blends-with-zero which is
10253     // important as a single pshufb is significantly faster for that.
10254     if (V1InUse && V2InUse) {
10255       if (Subtarget.hasSSE41())
10256         if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
10257                                                       Mask, Subtarget, DAG))
10258           return Blend;
10259 
10260       // We can use an unpack to do the blending rather than an or in some
10261     // cases. Even though the OR may be (very slightly) more efficient, we
10262     // prefer this lowering because there are common cases where part of
10263       // the complexity of the shuffles goes away when we do the final blend as
10264       // an unpack.
10265       // FIXME: It might be worth trying to detect if the unpack-feeding
10266       // shuffles will both be pshufb, in which case we shouldn't bother with
10267       // this.
10268       if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10269               DL, MVT::v16i8, V1, V2, Mask, DAG))
10270         return Unpack;
10271     }
10272 
10273     return PSHUFB;
10274   }
10275 
10276   // There are special ways we can lower some single-element blends.
10277   if (NumV2Elements == 1)
10278     if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
10279                                                          Mask, Subtarget, DAG))
10280       return V;
10281 
10282   if (SDValue BitBlend =
10283           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
10284     return BitBlend;
10285 
10286   // Check whether a compaction lowering can be done. This handles shuffles
10287   // which take every Nth element for some power-of-two N. See the helper
10288   // function for details.
10289   //
10290   // We special case these as they can be particularly efficiently handled with
10291   // the PACKUSWB instruction on x86 and they show up in common patterns of
10292   // rearranging bytes to truncate wide elements.
10293   bool IsSingleInput = V2.isUndef();
10294   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
10295     // NumEvenDrops is the number of times we have to drop even elements, i.e.
10296     // the log2 of the stride between the elements the mask keeps; we need that
10297     // many rounds of packing to realize the requested shuffle.
10298 
10299     // First we need to zero all the dropped bytes.
10300     assert(NumEvenDrops <= 3 &&
10301            "No support for dropping even elements more than 3 times.");
10302     // We use the mask type to pick which bytes are preserved based on how many
10303     // elements are dropped.
10304     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
10305     SDValue ByteClearMask = DAG.getBitcast(
10306         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
10307     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
10308     if (!IsSingleInput)
10309       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
10310 
10311     // Now pack things back together.
10312     V1 = DAG.getBitcast(MVT::v8i16, V1);
10313     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
10314     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
10315     for (int i = 1; i < NumEvenDrops; ++i) {
10316       Result = DAG.getBitcast(MVT::v8i16, Result);
10317       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
10318     }
10319 
10320     return Result;
10321   }
10322 
10323   // Handle multi-input cases by blending single-input shuffles.
10324   if (NumV2Elements > 0)
10325     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
10326                                                       Mask, DAG);
10327 
10328   // The fallback path for single-input shuffles widens this into two v8i16
10329   // vectors with unpacks, shuffles those, and then pulls them back together
10330   // with a pack.
10331   SDValue V = V1;
10332 
10333   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10334   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10335   for (int i = 0; i < 16; ++i)
10336     if (Mask[i] >= 0)
10337       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
10338 
10339   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
10340 
10341   SDValue VLoHalf, VHiHalf;
10342   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
10343   // them out and avoid using UNPCK{L,H} to extract the elements of V as
10344   // i16s.
10345   if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
10346                    [](int M) { return M >= 0 && M % 2 == 1; }) &&
10347       std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
10348                    [](int M) { return M >= 0 && M % 2 == 1; })) {
10349     // Use a mask to drop the high bytes.
10350     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
10351     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
10352                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
10353 
10354     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
10355     VHiHalf = DAG.getUNDEF(MVT::v8i16);
10356 
10357     // Squash the masks to point directly into VLoHalf.
10358     for (int &M : LoBlendMask)
10359       if (M >= 0)
10360         M /= 2;
10361     for (int &M : HiBlendMask)
10362       if (M >= 0)
10363         M /= 2;
10364   } else {
10365     // Otherwise just unpack the low half of V into VLoHalf and the high half into
10366     // VHiHalf so that we can blend them as i16s.
10367     VLoHalf = DAG.getBitcast(
10368         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
10369     VHiHalf = DAG.getBitcast(
10370         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
10371   }
10372 
10373   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
10374   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
10375 
10376   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
10377 }
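
// Worked example of the compaction path above (illustration only): for the
// two-input v16i8 mask <0,2,4,...,30>, canLowerByDroppingEvenElements returns
// 1. Each input is ANDed with a v8i16 splat of 0x00FF, clearing the
// odd-indexed byte of every 16-bit lane, and a single PACKUSWB
// (X86ISD::PACKUS) of the two masked inputs then yields exactly the sixteen
// requested bytes.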
10378 
10379 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
10380 ///
10381 /// This routine breaks down the specific type of 128-bit shuffle and
10382 /// dispatches to the lowering routines accordingly.
10383 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10384                                         MVT VT, SDValue V1, SDValue V2,
10385                                         const X86Subtarget &Subtarget,
10386                                         SelectionDAG &DAG) {
10387   switch (VT.SimpleTy) {
10388   case MVT::v2i64:
10389     return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10390   case MVT::v2f64:
10391     return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10392   case MVT::v4i32:
10393     return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10394   case MVT::v4f32:
10395     return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10396   case MVT::v8i16:
10397     return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10398   case MVT::v16i8:
10399     return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10400 
10401   default:
10402     llvm_unreachable("Unimplemented!");
10403   }
10404 }
10405 
10406 /// \brief Helper function to test whether a shuffle mask could be
10407 /// simplified by widening the elements being shuffled.
10408 ///
10409 /// Writes the widened mask into WidenedMask if the mask can be widened;
10410 /// otherwise leaves WidenedMask in an unspecified state.
10411 ///
10412 /// NOTE: This must handle normal vector shuffle masks and *target* vector
10413 /// shuffle masks. The latter have the special property of a '-2' representing
10414 /// a zero-ed lane of a vector.
10415 static bool canWidenShuffleElements(ArrayRef<int> Mask,
10416                                     SmallVectorImpl<int> &WidenedMask) {
10417   WidenedMask.assign(Mask.size() / 2, 0);
10418   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
10419     // If both elements are undef, it's trivial.
10420     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10421       WidenedMask[i/2] = SM_SentinelUndef;
10422       continue;
10423     }
10424 
10425     // Check for an undef mask and a mask value properly aligned to fit with
10426     // a pair of values. If we find such a case, use the non-undef mask's value.
10427     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
10428       WidenedMask[i/2] = Mask[i + 1] / 2;
10429       continue;
10430     }
10431     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10432       WidenedMask[i/2] = Mask[i] / 2;
10433       continue;
10434     }
10435 
10436     // When zeroing, we need to spread the zeroing across both lanes to widen.
10437     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10438       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10439           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10440         WidenedMask[i/2] = SM_SentinelZero;
10441         continue;
10442       }
10443       return false;
10444     }
10445 
10446     // Finally check if the two mask values are adjacent and aligned with
10447     // a pair.
10448     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
10449       WidenedMask[i/2] = Mask[i] / 2;
10450       continue;
10451     }
10452 
10453     // Otherwise we can't safely widen the elements used in this shuffle.
10454     return false;
10455   }
10456   assert(WidenedMask.size() == Mask.size() / 2 &&
10457          "Incorrect size of mask after widening the elements!");
10458 
10459   return true;
10460 }
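
// Illustrative sketch only (never called): exercises the routine above on a
// few hard-coded masks to show what "widening" means; -1 is SM_SentinelUndef.
// The function name is expository, and LLVM_ATTRIBUTE_UNUSED is assumed to be
// available through the existing Support includes.
LLVM_ATTRIBUTE_UNUSED static void exampleWidenShuffleMasks() {
  SmallVector<int, 8> Widened;
  // <0,1,6,7> widens to <0,3>: both pairs are adjacent and even-aligned.
  (void)canWidenShuffleElements({0, 1, 6, 7}, Widened);
  // <undef,1,4,5> widens to <0,2>: the undef pairs up with the odd-aligned 1.
  (void)canWidenShuffleElements({-1, 1, 4, 5}, Widened);
  // <0,2,4,6> cannot be widened: 0 and 2 do not form an adjacent pair.
  (void)canWidenShuffleElements({0, 2, 4, 6}, Widened);
}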
10461 
10462 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
10463 ///
10464 /// This routine just extracts two subvectors, shuffles them independently, and
10465 /// then concatenates them back together. This should work effectively with all
10466 /// AVX vector shuffle types.
10467 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10468                                           SDValue V2, ArrayRef<int> Mask,
10469                                           SelectionDAG &DAG) {
10470   assert(VT.getSizeInBits() >= 256 &&
10471          "Only for 256-bit or wider vector shuffles!");
10472   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10473   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10474 
10475   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10476   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10477 
10478   int NumElements = VT.getVectorNumElements();
10479   int SplitNumElements = NumElements / 2;
10480   MVT ScalarVT = VT.getVectorElementType();
10481   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10482 
10483   // Rather than splitting build-vectors, just build two narrower build
10484   // vectors. This helps shuffling with splats and zeros.
10485   auto SplitVector = [&](SDValue V) {
10486     V = peekThroughBitcasts(V);
10487 
10488     MVT OrigVT = V.getSimpleValueType();
10489     int OrigNumElements = OrigVT.getVectorNumElements();
10490     int OrigSplitNumElements = OrigNumElements / 2;
10491     MVT OrigScalarVT = OrigVT.getVectorElementType();
10492     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
10493 
10494     SDValue LoV, HiV;
10495 
10496     auto *BV = dyn_cast<BuildVectorSDNode>(V);
10497     if (!BV) {
10498       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10499                         DAG.getIntPtrConstant(0, DL));
10500       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10501                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
10502     } else {
10503 
10504       SmallVector<SDValue, 16> LoOps, HiOps;
10505       for (int i = 0; i < OrigSplitNumElements; ++i) {
10506         LoOps.push_back(BV->getOperand(i));
10507         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
10508       }
10509       LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
10510       HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
10511     }
10512     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
10513                           DAG.getBitcast(SplitVT, HiV));
10514   };
10515 
10516   SDValue LoV1, HiV1, LoV2, HiV2;
10517   std::tie(LoV1, HiV1) = SplitVector(V1);
10518   std::tie(LoV2, HiV2) = SplitVector(V2);
10519 
10520   // Now create two 4-way blends of these half-width vectors.
10521   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10522     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10523     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
10524     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
10525     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
10526     for (int i = 0; i < SplitNumElements; ++i) {
10527       int M = HalfMask[i];
10528       if (M >= NumElements) {
10529         if (M >= NumElements + SplitNumElements)
10530           UseHiV2 = true;
10531         else
10532           UseLoV2 = true;
10533         V2BlendMask[i] = M - NumElements;
10534         BlendMask[i] = SplitNumElements + i;
10535       } else if (M >= 0) {
10536         if (M >= SplitNumElements)
10537           UseHiV1 = true;
10538         else
10539           UseLoV1 = true;
10540         V1BlendMask[i] = M;
10541         BlendMask[i] = i;
10542       }
10543     }
10544 
10545     // Because the lowering happens after all combining takes place, we need to
10546     // manually combine these blend masks as much as possible so that we create
10547     // a minimal number of high-level vector shuffle nodes.
10548 
10549     // First try just blending the halves of V1 or V2.
10550     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10551       return DAG.getUNDEF(SplitVT);
10552     if (!UseLoV2 && !UseHiV2)
10553       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10554     if (!UseLoV1 && !UseHiV1)
10555       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10556 
10557     SDValue V1Blend, V2Blend;
10558     if (UseLoV1 && UseHiV1) {
10559       V1Blend =
10560         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10561     } else {
10562       // We only use half of V1 so map the usage down into the final blend mask.
10563       V1Blend = UseLoV1 ? LoV1 : HiV1;
10564       for (int i = 0; i < SplitNumElements; ++i)
10565         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10566           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10567     }
10568     if (UseLoV2 && UseHiV2) {
10569       V2Blend =
10570         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10571     } else {
10572       // We only use half of V2 so map the usage down into the final blend mask.
10573       V2Blend = UseLoV2 ? LoV2 : HiV2;
10574       for (int i = 0; i < SplitNumElements; ++i)
10575         if (BlendMask[i] >= SplitNumElements)
10576           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10577     }
10578     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10579   };
10580   SDValue Lo = HalfBlend(LoMask);
10581   SDValue Hi = HalfBlend(HiMask);
10582   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10583 }
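
// Worked example (illustration only): for a v8i32 shuffle with mask
// <0,8,1,9,6,14,7,15>, the low half-mask <0,8,1,9> touches only LoV1 and LoV2
// and becomes shuffle(LoV1, LoV2, <0,4,1,5>), the high half-mask <6,14,7,15>
// touches only HiV1 and HiV2 and becomes shuffle(HiV1, HiV2, <2,6,3,7>), and
// the two v4i32 halves are then glued back together with CONCAT_VECTORS.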
10584 
10585 /// \brief Either split a vector in halves or decompose the shuffles and the
10586 /// blend.
10587 ///
10588 /// This is provided as a good fallback for many lowerings of non-single-input
10589 /// shuffles with more than one 128-bit lane. In those cases, we want to select
10590 /// between splitting the shuffle into 128-bit components and stitching those
10591 /// back together vs. extracting the single-input shuffles and blending those
10592 /// results.
10593 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
10594                                                 SDValue V1, SDValue V2,
10595                                                 ArrayRef<int> Mask,
10596                                                 SelectionDAG &DAG) {
10597   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
10598          "shuffles as it could then recurse on itself.");
10599   int Size = Mask.size();
10600 
10601   // If this can be modeled as a broadcast of two elements followed by a blend,
10602   // prefer that lowering. This is especially important because broadcasts can
10603   // often fold with memory operands.
10604   auto DoBothBroadcast = [&] {
10605     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10606     for (int M : Mask)
10607       if (M >= Size) {
10608         if (V2BroadcastIdx < 0)
10609           V2BroadcastIdx = M - Size;
10610         else if (M - Size != V2BroadcastIdx)
10611           return false;
10612       } else if (M >= 0) {
10613         if (V1BroadcastIdx < 0)
10614           V1BroadcastIdx = M;
10615         else if (M != V1BroadcastIdx)
10616           return false;
10617       }
10618     return true;
10619   };
10620   if (DoBothBroadcast())
10621     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10622                                                       DAG);
10623 
10624   // If the inputs all stem from a single 128-bit lane of each input, then we
10625   // split them rather than blending because the split will decompose to
10626   // unusually few instructions.
10627   int LaneCount = VT.getSizeInBits() / 128;
10628   int LaneSize = Size / LaneCount;
10629   SmallBitVector LaneInputs[2];
10630   LaneInputs[0].resize(LaneCount, false);
10631   LaneInputs[1].resize(LaneCount, false);
10632   for (int i = 0; i < Size; ++i)
10633     if (Mask[i] >= 0)
10634       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10635   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10636     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10637 
10638   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10639   // that the decomposed single-input shuffles don't end up here.
10640   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10641 }
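
// For example (illustration only): a v8i32 mask such as <1,1,1,1,13,13,13,13>
// broadcasts one element of each input, so DoBothBroadcast fires and the
// shuffle is decomposed into two single-input shuffles plus a blend, while a
// mask whose inputs all come from a single 128-bit lane of each operand is
// instead split by splitAndLowerVectorShuffle.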
10642 
10643 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10644 /// a permutation and blend of those lanes.
10645 ///
10646 /// This essentially blends the out-of-lane inputs to each lane into the lane
10647 /// from a permuted copy of the vector. This lowering strategy results in four
10648 /// instructions in the worst case for a single-input cross lane shuffle which
10649 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10650 /// of. Special cases for each particular shuffle pattern should be handled
10651 /// prior to trying this lowering.
10652 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
10653                                                        SDValue V1, SDValue V2,
10654                                                        ArrayRef<int> Mask,
10655                                                        SelectionDAG &DAG) {
10656   // FIXME: This should probably be generalized for 512-bit vectors as well.
10657   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
10658   int Size = Mask.size();
10659   int LaneSize = Size / 2;
10660 
10661   // If there are only inputs from one 128-bit lane, splitting will in fact be
10662   // less expensive. The flags track whether the given lane contains an element
10663   // that crosses to another lane.
10664   bool LaneCrossing[2] = {false, false};
10665   for (int i = 0; i < Size; ++i)
10666     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10667       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10668   if (!LaneCrossing[0] || !LaneCrossing[1])
10669     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10670 
10671   assert(V2.isUndef() &&
10672          "The rest of this routine only works on single-input shuffles");
10673 
10674   SmallVector<int, 32> FlippedBlendMask(Size);
10675   for (int i = 0; i < Size; ++i)
10676     FlippedBlendMask[i] =
10677         Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10678                                 ? Mask[i]
10679                                 : Mask[i] % LaneSize +
10680                                       (i / LaneSize) * LaneSize + Size);
10681 
10682   // Flip the vector, and blend the results which should now be in-lane. The
10683   // VPERM2X128 immediate uses bits [1:0] to select the source for the low
10684   // half of the destination and bits [5:4] for the high half. The value 3
10685   // selects the high half of source 2 and the value 2 selects its low half.
10686   // We only use source 2 to allow folding it into a memory operand.
10687   unsigned PERMMask = 3 | 2 << 4;
10688   SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10689                                 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
10690   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10691 }
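
// Worked example (illustration only): for a single-input v4f64 shuffle with
// mask <2,1,3,0>, both 128-bit lanes contain a cross-lane element, so the
// routine forms Flipped = VPERM2X128(undef, V1, 0x23) (V1 with its two 128-bit
// halves swapped) and then emits the in-lane blend
// shuffle(V1, Flipped, <4,1,3,6>), which reproduces <2,1,3,0> of V1.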
10692 
10693 /// \brief Handle lowering 2-lane 128-bit shuffles.
10694 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10695                                         SDValue V2, ArrayRef<int> Mask,
10696                                         const X86Subtarget &Subtarget,
10697                                         SelectionDAG &DAG) {
10698   // TODO: If minimizing size and one of the inputs is a zero vector and
10699   // the zero vector has only one use, we could use a VPERM2X128 to save the
10700   // instruction bytes needed to explicitly generate the zero vector.
10701 
10702   // Blends are faster and handle all the non-lane-crossing cases.
10703   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10704                                                 Subtarget, DAG))
10705     return Blend;
10706 
10707   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
10708   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
10709 
10710   // If either input operand is a zero vector, use VPERM2X128 because its mask
10711   // allows us to replace the zero input with an implicit zero.
10712   if (!IsV1Zero && !IsV2Zero) {
10713     // Check for patterns which can be matched with a single insert of a 128-bit
10714     // subvector.
10715     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
10716     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
10717       // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
10718       if (Subtarget.hasAVX2() && V2.isUndef())
10719         return SDValue();
10720 
10721       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10722                                    VT.getVectorNumElements() / 2);
10723       SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10724                                 DAG.getIntPtrConstant(0, DL));
10725       SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10726                                 OnlyUsesV1 ? V1 : V2,
10727                                 DAG.getIntPtrConstant(0, DL));
10728       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10729     }
10730   }
10731 
10732   // Otherwise form a 128-bit permutation. After accounting for undefs,
10733   // convert the 64-bit shuffle mask selection values into 128-bit
10734   // selection bits by dividing the indexes by 2 and shifting into positions
10735   // defined by a vperm2*128 instruction's immediate control byte.
10736 
10737   // The immediate permute control byte looks like this:
10738   //    [1:0] - select 128 bits from sources for low half of destination
10739   //    [2]   - ignore
10740   //    [3]   - zero low half of destination
10741   //    [5:4] - select 128 bits from sources for high half of destination
10742   //    [6]   - ignore
10743   //    [7]   - zero high half of destination
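  //
  // For example, with Mask[0] == 2 and Mask[2] == 4 (a v4i64 mask of
  // <2,3,4,5>), PermMask below becomes (2/2) | (4/2) << 4 == 0x21: the high
  // half of V1 feeds the low half of the destination and the low half of V2
  // feeds the high half.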
10744 
10745   int MaskLO = Mask[0];
10746   if (MaskLO == SM_SentinelUndef)
10747     MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
10748 
10749   int MaskHI = Mask[2];
10750   if (MaskHI == SM_SentinelUndef)
10751     MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
10752 
10753   unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
10754 
10755   // If either input is a zero vector, replace it with an undef input.
10756   // Shuffle mask values <  4 are selecting elements of V1.
10757   // Shuffle mask values >= 4 are selecting elements of V2.
10758   // Adjust each half of the permute mask by clearing the half that was
10759   // selecting the zero vector and setting the zero mask bit.
10760   if (IsV1Zero) {
10761     V1 = DAG.getUNDEF(VT);
10762     if (MaskLO < 4)
10763       PermMask = (PermMask & 0xf0) | 0x08;
10764     if (MaskHI < 4)
10765       PermMask = (PermMask & 0x0f) | 0x80;
10766   }
10767   if (IsV2Zero) {
10768     V2 = DAG.getUNDEF(VT);
10769     if (MaskLO >= 4)
10770       PermMask = (PermMask & 0xf0) | 0x08;
10771     if (MaskHI >= 4)
10772       PermMask = (PermMask & 0x0f) | 0x80;
10773   }
10774 
10775   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10776                      DAG.getConstant(PermMask, DL, MVT::i8));
10777 }
10778 
10779 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10780 /// shuffling each lane.
10781 ///
10782 /// This will only succeed when fixing the 128-bit lanes results in a
10783 /// single-input non-lane-crossing shuffle with a repeating shuffle mask in
10784 /// each 128-bit lane. This handles many cases where we can quickly blend away
10785 /// the lane crosses early and then use simpler shuffles within each lane.
10786 ///
10787 /// FIXME: It might be worthwhile at some point to support this without
10788 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10789 /// in x86 only floating point has interesting non-repeating shuffles, and even
10790 /// those are still *marginally* more expensive.
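/// As an illustrative example, a v8f32 shuffle of V1 and V2 with the mask
/// <13, 12, 15, 14, 1, 0, 3, 2> is first lane-fixed as a v4f64 shuffle with
/// mask <6, 7, 0, 1> (the high lane of V2 followed by the low lane of V1),
/// leaving only the repeating in-lane mask <1, 0, 3, 2> to apply afterwards.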
10791 static SDValue lowerVectorShuffleByMerging128BitLanes(
10792     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10793     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10794   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
10795 
10796   int Size = Mask.size();
10797   int LaneSize = 128 / VT.getScalarSizeInBits();
10798   int NumLanes = Size / LaneSize;
10799   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10800 
10801   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10802   // check whether the in-128-bit lane shuffles share a repeating pattern.
10803   SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
10804   SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
10805   for (int i = 0; i < Size; ++i) {
10806     if (Mask[i] < 0)
10807       continue;
10808 
10809     int j = i / LaneSize;
10810 
10811     if (Lanes[j] < 0) {
10812       // First entry we've seen for this lane.
10813       Lanes[j] = Mask[i] / LaneSize;
10814     } else if (Lanes[j] != Mask[i] / LaneSize) {
10815       // This doesn't match the lane selected previously!
10816       return SDValue();
10817     }
10818 
10819     // Check that within each lane we have a consistent shuffle mask.
10820     int k = i % LaneSize;
10821     if (InLaneMask[k] < 0) {
10822       InLaneMask[k] = Mask[i] % LaneSize;
10823     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10824       // This doesn't fit a repeating in-lane mask.
10825       return SDValue();
10826     }
10827   }
10828 
10829   // First shuffle the lanes into place.
10830   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10831                                 VT.getSizeInBits() / 64);
10832   SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
10833   for (int i = 0; i < NumLanes; ++i)
10834     if (Lanes[i] >= 0) {
10835       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10836       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10837     }
10838 
10839   V1 = DAG.getBitcast(LaneVT, V1);
10840   V2 = DAG.getBitcast(LaneVT, V2);
10841   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10842 
10843   // Cast it back to the type we actually want.
10844   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
10845 
10846   // Now do a simple shuffle that isn't lane crossing.
10847   SmallVector<int, 8> NewMask((unsigned)Size, -1);
10848   for (int i = 0; i < Size; ++i)
10849     if (Mask[i] >= 0)
10850       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10851   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10852          "Must not introduce lane crosses at this point!");
10853 
10854   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10855 }
10856 
10857 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
10858 /// This allows for fast cases such as subvector extraction/insertion
10859 /// or shuffling smaller vector types which can lower more efficiently.
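/// For instance, a v8f32 shuffle with mask <4, 5, 6, 7, u, u, u, u> is lowered
/// as an extract of the upper 128-bit half of V1 inserted at element 0, and a
/// mask that references at most two of the four input halves is rewritten as a
/// narrower v4f32 shuffle of just those halves.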
10860 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
10861                                                SDValue V1, SDValue V2,
10862                                                ArrayRef<int> Mask,
10863                                                const X86Subtarget &Subtarget,
10864                                                SelectionDAG &DAG) {
10865   assert(VT.is256BitVector() && "Expected 256-bit vector");
10866 
10867   unsigned NumElts = VT.getVectorNumElements();
10868   unsigned HalfNumElts = NumElts / 2;
10869   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
10870 
10871   bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
10872   bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
10873   if (!UndefLower && !UndefUpper)
10874     return SDValue();
10875 
10876   // Upper half is undef and lower half is whole upper subvector.
10877   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
10878   if (UndefUpper &&
10879       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
10880     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10881                              DAG.getIntPtrConstant(HalfNumElts, DL));
10882     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10883                        DAG.getIntPtrConstant(0, DL));
10884   }
10885 
10886   // Lower half is undef and upper half is whole lower subvector.
10887   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
10888   if (UndefLower &&
10889       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
10890     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10891                              DAG.getIntPtrConstant(0, DL));
10892     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10893                        DAG.getIntPtrConstant(HalfNumElts, DL));
10894   }
10895 
10896   // If the shuffle only uses two of the four halves of the input operands,
10897   // then extract them and perform the 'half' shuffle at half width.
10898   // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
10899   int HalfIdx1 = -1, HalfIdx2 = -1;
10900   SmallVector<int, 8> HalfMask(HalfNumElts);
10901   unsigned Offset = UndefLower ? HalfNumElts : 0;
10902   for (unsigned i = 0; i != HalfNumElts; ++i) {
10903     int M = Mask[i + Offset];
10904     if (M < 0) {
10905       HalfMask[i] = M;
10906       continue;
10907     }
10908 
10909     // Determine which of the 4 half vectors this element is from.
10910     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
10911     int HalfIdx = M / HalfNumElts;
10912 
10913     // Determine the element index into its half vector source.
10914     int HalfElt = M % HalfNumElts;
10915 
10916     // We can shuffle with up to 2 half vectors, set the new 'half'
10917     // shuffle mask accordingly.
10918     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
10919       HalfMask[i] = HalfElt;
10920       HalfIdx1 = HalfIdx;
10921       continue;
10922     }
10923     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
10924       HalfMask[i] = HalfElt + HalfNumElts;
10925       HalfIdx2 = HalfIdx;
10926       continue;
10927     }
10928 
10929     // Too many half vectors referenced.
10930     return SDValue();
10931   }
10932   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
10933 
10934   // Only shuffle the halves of the inputs when useful.
10935   int NumLowerHalves =
10936       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
10937   int NumUpperHalves =
10938       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
10939 
10940   // uuuuXXXX - don't extract uppers just to insert again.
10941   if (UndefLower && NumUpperHalves != 0)
10942     return SDValue();
10943 
10944   // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
10945   if (UndefUpper && NumUpperHalves == 2)
10946     return SDValue();
10947 
10948   // AVX2 - XXXXuuuu - always extract lowers.
10949   if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
10950     // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
10951     if (VT == MVT::v4f64 || VT == MVT::v4i64)
10952       return SDValue();
10953     // AVX2 supports variable 32-bit element cross-lane shuffles.
10954     if (VT == MVT::v8f32 || VT == MVT::v8i32) {
10955       // XXXXuuuu - don't extract lowers and uppers.
10956       if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
10957         return SDValue();
10958     }
10959   }
10960 
10961   auto GetHalfVector = [&](int HalfIdx) {
10962     if (HalfIdx < 0)
10963       return DAG.getUNDEF(HalfVT);
10964     SDValue V = (HalfIdx < 2 ? V1 : V2);
10965     HalfIdx = (HalfIdx % 2) * HalfNumElts;
10966     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
10967                        DAG.getIntPtrConstant(HalfIdx, DL));
10968   };
10969 
10970   SDValue Half1 = GetHalfVector(HalfIdx1);
10971   SDValue Half2 = GetHalfVector(HalfIdx2);
10972   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
10973   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
10974                      DAG.getIntPtrConstant(Offset, DL));
10975 }
10976 
10977 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10978 /// given mask.
10979 ///
10980 /// This returns true if the elements from a particular input are already in the
10981 /// slot required by the given mask and require no permutation.
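/// For example, with the 4-element mask <0, 6, 2, 7>, input 0 is in place
/// (elements 0 and 2 already occupy slots 0 and 2), while input 1 is not,
/// because element 6 of the concatenated inputs is requested in slot 1.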
10982 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10983   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10984   int Size = Mask.size();
10985   for (int i = 0; i < Size; ++i)
10986     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10987       return false;
10988 
10989   return true;
10990 }
10991 
10992 /// Handle case where shuffle sources are coming from the same 128-bit lane and
10993 /// every lane can be represented as the same repeating mask - allowing us to
10994 /// shuffle the sources with the repeating shuffle and then permute the result
10995 /// to the destination lanes.
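/// As a sketch: a v4f64 shuffle with mask <3, 2, 3, 2> becomes an in-lane
/// element swap using the repeating mask <1, 0, 3, 2>, followed by a sub-lane
/// permute <2, 3, 2, 3> that copies the high lane's result into both
/// destination lanes.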
10996 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
10997     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10998     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10999   int NumElts = VT.getVectorNumElements();
11000   int NumLanes = VT.getSizeInBits() / 128;
11001   int NumLaneElts = NumElts / NumLanes;
11002 
11003   // On AVX2 we may be able to just shuffle the lowest elements and then
11004   // broadcast the result.
11005   if (Subtarget.hasAVX2()) {
11006     for (unsigned BroadcastSize : {16, 32, 64}) {
11007       if (BroadcastSize <= VT.getScalarSizeInBits())
11008         continue;
11009       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11010 
11011       // Attempt to match a repeating pattern every NumBroadcastElts,
11012       // accounting for UNDEFs, but only if it references the lowest 128-bit
11013       // lane of the inputs.
11014       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11015         for (int i = 0; i != NumElts; i += NumBroadcastElts)
11016           for (int j = 0; j != NumBroadcastElts; ++j) {
11017             int M = Mask[i + j];
11018             if (M < 0)
11019               continue;
11020             int &R = RepeatMask[j];
11021             if (0 != ((M % NumElts) / NumLaneElts))
11022               return false;
11023             if (0 <= R && R != M)
11024               return false;
11025             R = M;
11026           }
11027         return true;
11028       };
11029 
11030       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11031       if (!FindRepeatingBroadcastMask(RepeatMask))
11032         continue;
11033 
11034       // Shuffle the (lowest) repeated elements in place for broadcast.
11035       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11036 
11037       // Shuffle the actual broadcast.
11038       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11039       for (int i = 0; i != NumElts; i += NumBroadcastElts)
11040         for (int j = 0; j != NumBroadcastElts; ++j)
11041           BroadcastMask[i + j] = j;
11042       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11043                                   BroadcastMask);
11044     }
11045   }
11046 
11047   // Bail if we already have a repeated lane shuffle mask.
11048   SmallVector<int, 8> RepeatedShuffleMask;
11049   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11050     return SDValue();
11051 
11052   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11053   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11054   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11055   int NumSubLanes = NumLanes * SubLaneScale;
11056   int NumSubLaneElts = NumLaneElts / SubLaneScale;
11057 
11058   // Check that all the sources are coming from the same lane and see if we
11059   // can form a repeating shuffle mask (local to each lane). At the same time,
11060   // determine the source sub-lane for each destination sub-lane.
11061   int TopSrcSubLane = -1;
11062   SmallVector<int, 8> RepeatedLaneMask((unsigned)NumLaneElts, -1);
11063   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11064   for (int i = 0; i != NumElts; ++i) {
11065     int M = Mask[i];
11066     if (M < 0)
11067       continue;
11068     assert(0 <= M && M < 2 * NumElts);
11069 
11070     // Check that the local mask index is the same for every lane. We always do
11071     // this with 128-bit lanes to match is128BitLaneRepeatedShuffleMask.
11072     int LocalM = M < NumElts ? (M % NumLaneElts) : (M % NumLaneElts) + NumElts;
11073     int &RepeatM = RepeatedLaneMask[i % NumLaneElts];
11074     if (0 <= RepeatM && RepeatM != LocalM)
11075       return SDValue();
11076     RepeatM = LocalM;
11077 
11078     // Check that the whole of each destination sub-lane comes from the same
11079     // source sub-lane; we need to calculate the source based on where the
11080     // repeated lane mask will have left it.
11081     int SrcLane = (M % NumElts) / NumLaneElts;
11082     int SrcSubLane = (SrcLane * SubLaneScale) +
11083                      ((i % NumLaneElts) / NumSubLaneElts);
11084     int &Dst2SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11085     if (0 <= Dst2SrcSubLane && SrcSubLane != Dst2SrcSubLane)
11086       return SDValue();
11087     Dst2SrcSubLane = SrcSubLane;
11088 
11089     // Track the topmost source sub-lane - by setting the remaining to UNDEF
11090     // we can greatly simplify shuffle matching.
11091     TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11092   }
11093   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11094          "Unexpected source lane");
11095 
11096   // Create a repeating shuffle mask for the entire vector.
11097   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11098   for (int i = 0, e = ((TopSrcSubLane + 1) * NumSubLaneElts); i != e; ++i) {
11099     int M = RepeatedLaneMask[i % NumLaneElts];
11100     if (M < 0)
11101       continue;
11102     int Lane = i / NumLaneElts;
11103     RepeatedMask[i] = M + (Lane * NumLaneElts);
11104   }
11105   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
11106 
11107   // Shuffle each source sub-lane to its destination.
11108   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
11109   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
11110     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11111     if (SrcSubLane < 0)
11112       continue;
11113     for (int j = 0; j != NumSubLaneElts; ++j)
11114       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
11115   }
11116 
11117   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
11118                               SubLaneMask);
11119 }
11120 
11121 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
11122                                             ArrayRef<int> Mask, SDValue V1,
11123                                             SDValue V2, SelectionDAG &DAG) {
11124 
11125   // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
11126   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
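  // For example, for V4F64 the mask <1, 5, 2, 7> satisfies the pattern above
  // and encodes as Immediate = 0b1011, i.e. SHUFPD V1, V2, 0xB.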
11127   assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
11128   int NumElts = VT.getVectorNumElements();
11129   bool ShufpdMask = true;
11130   bool CommutableMask = true;
11131   unsigned Immediate = 0;
11132   for (int i = 0; i < NumElts; ++i) {
11133     if (Mask[i] < 0)
11134       continue;
11135     int Val = (i & 6) + NumElts * (i & 1);
11136     int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
11137     if (Mask[i] < Val ||  Mask[i] > Val + 1)
11138       ShufpdMask = false;
11139     if (Mask[i] < CommutVal ||  Mask[i] > CommutVal + 1)
11140       CommutableMask = false;
11141     Immediate |= (Mask[i] % 2) << i;
11142   }
11143   if (ShufpdMask)
11144     return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11145                        DAG.getConstant(Immediate, DL, MVT::i8));
11146   if (CommutableMask)
11147     return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11148                        DAG.getConstant(Immediate, DL, MVT::i8));
11149   return SDValue();
11150 }
11151 
11152 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
11153 ///
11154 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
11155 /// isn't available.
11156 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11157                                        SDValue V1, SDValue V2,
11158                                        const X86Subtarget &Subtarget,
11159                                        SelectionDAG &DAG) {
11160   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11161   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11162   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11163 
11164   SmallVector<int, 4> WidenedMask;
11165   if (canWidenShuffleElements(Mask, WidenedMask))
11166     if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
11167                                              Subtarget, DAG))
11168       return V;
11169 
11170   if (V2.isUndef()) {
11171     // Check for being able to broadcast a single element.
11172     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11173             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11174       return Broadcast;
11175 
11176     // Use low duplicate instructions for masks that match their pattern.
11177     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11178       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
11179 
11180     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
11181       // Non-half-crossing single input shuffles can be lowered with an
11182       // interleaved permutation.
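      // (For example, the mask <1, 0, 3, 2> encodes here as immediate 0b0101.)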
11183       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11184                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
11185       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
11186                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11187     }
11188 
11189     // With AVX2 we have direct support for this permutation.
11190     if (Subtarget.hasAVX2())
11191       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
11192                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11193 
11194     // Try to create an in-lane repeating shuffle mask and then shuffle
11195     // the results into the target lanes.
11196     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11197             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11198       return V;
11199 
11200     // Otherwise, fall back.
11201     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
11202                                                    DAG);
11203   }
11204 
11205   // Use dedicated unpack instructions for masks that match their pattern.
11206   if (SDValue V =
11207           lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
11208     return V;
11209 
11210   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
11211                                                 Subtarget, DAG))
11212     return Blend;
11213 
11214   // Check if the blend happens to exactly fit that of SHUFPD.
11215   if (SDValue Op =
11216       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
11217     return Op;
11218 
11219   // Try to create an in-lane repeating shuffle mask and then shuffle
11220   // the results into the target lanes.
11221   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11222           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11223     return V;
11224 
11225   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11226   // shuffle. However, if we have AVX2 and either input is already in place,
11227   // we will be able to shuffle the other input even across lanes in a single
11228   // instruction, so skip this pattern.
11229   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11230                                 isShuffleMaskInputInPlace(1, Mask))))
11231     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11232             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11233       return Result;
11234 
11235   // If we have AVX2 then we always want to lower with a blend because at v4 we
11236   // can fully permute the elements.
11237   if (Subtarget.hasAVX2())
11238     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
11239                                                       Mask, DAG);
11240 
11241   // Otherwise fall back on generic lowering.
11242   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
11243 }
11244 
11245 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
11246 ///
11247 /// This routine is only called when we have AVX2 and thus a reasonable
11248 /// instruction set for v4i64 shuffling.
11249 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11250                                        SDValue V1, SDValue V2,
11251                                        const X86Subtarget &Subtarget,
11252                                        SelectionDAG &DAG) {
11253   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11254   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11255   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11256   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
11257 
11258   SmallVector<int, 4> WidenedMask;
11259   if (canWidenShuffleElements(Mask, WidenedMask))
11260     if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
11261                                              Subtarget, DAG))
11262       return V;
11263 
11264   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
11265                                                 Subtarget, DAG))
11266     return Blend;
11267 
11268   // Check for being able to broadcast a single element.
11269   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
11270                                                         Mask, Subtarget, DAG))
11271     return Broadcast;
11272 
11273   if (V2.isUndef()) {
11274     // When the shuffle is mirrored between the 128-bit lanes of the vector, we
11275     // can use lower latency instructions that will operate on both lanes.
11276     SmallVector<int, 2> RepeatedMask;
11277     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
11278       SmallVector<int, 4> PSHUFDMask;
11279       scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
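      // (For example, a repeated 64-bit mask <1, 0> scales to the 32-bit
      // PSHUFD mask <2, 3, 0, 1>.)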
11280       return DAG.getBitcast(
11281           MVT::v4i64,
11282           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
11283                       DAG.getBitcast(MVT::v8i32, V1),
11284                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11285     }
11286 
11287     // AVX2 provides a direct instruction for permuting a single input across
11288     // lanes.
11289     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
11290                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11291   }
11292 
11293   // Try to use shift instructions.
11294   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
11295                                                 Subtarget, DAG))
11296     return Shift;
11297 
11298   // Use dedicated unpack instructions for masks that match their pattern.
11299   if (SDValue V =
11300           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
11301     return V;
11302 
11303   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11304   // shuffle. However, if we have AVX2 and either input is already in place,
11305   // we will be able to shuffle the other input even across lanes in a single
11306   // instruction, so skip this pattern.
11307   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11308                                  isShuffleMaskInputInPlace(1, Mask))))
11309     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11310             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
11311       return Result;
11312 
11313   // Otherwise fall back on generic blend lowering.
11314   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
11315                                                     Mask, DAG);
11316 }
11317 
11318 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
11319 ///
11320 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
11321 /// isn't available.
11322 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11323                                        SDValue V1, SDValue V2,
11324                                        const X86Subtarget &Subtarget,
11325                                        SelectionDAG &DAG) {
11326   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11327   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11328   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11329 
11330   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
11331                                                 Subtarget, DAG))
11332     return Blend;
11333 
11334   // Check for being able to broadcast a single element.
11335   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
11336                                                         Mask, Subtarget, DAG))
11337     return Broadcast;
11338 
11339   // If the shuffle mask is repeated in each 128-bit lane, we have many more
11340   // options to efficiently lower the shuffle.
11341   SmallVector<int, 4> RepeatedMask;
11342   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
11343     assert(RepeatedMask.size() == 4 &&
11344            "Repeated masks must be half the mask width!");
11345 
11346     // Use even/odd duplicate instructions for masks that match their pattern.
11347     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11348       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
11349     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11350       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
11351 
11352     if (V2.isUndef())
11353       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
11354                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11355 
11356     // Use dedicated unpack instructions for masks that match their pattern.
11357     if (SDValue V =
11358             lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
11359       return V;
11360 
11361     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
11362     // have already handled any direct blends.
11363     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
11364   }
11365 
11366   // Try to create an in-lane repeating shuffle mask and then shuffle
11367   // the results into the target lanes.
11368   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11369           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11370     return V;
11371 
11372   // If we have a single input shuffle with different shuffle patterns in the
11373   // two 128-bit lanes use the variable mask to VPERMILPS.
11374   if (V2.isUndef()) {
11375     SDValue VPermMask[8];
11376     for (int i = 0; i < 8; ++i)
11377       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11378                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
11379     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
11380       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
11381                          DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
11382 
11383     if (Subtarget.hasAVX2())
11384       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
11385                          DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11386 
11387     // Otherwise, fall back.
11388     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
11389                                                    DAG);
11390   }
11391 
11392   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11393   // shuffle.
11394   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11395           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11396     return Result;
11397 
11398   // If we have AVX2 then we always want to lower with a blend because at v8 we
11399   // can fully permute the elements.
11400   if (Subtarget.hasAVX2())
11401     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
11402                                                       Mask, DAG);
11403 
11404   // Otherwise fall back on generic lowering.
11405   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
11406 }
11407 
11408 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
11409 ///
11410 /// This routine is only called when we have AVX2 and thus a reasonable
11411 /// instruction set for v8i32 shuffling.
11412 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11413                                        SDValue V1, SDValue V2,
11414                                        const X86Subtarget &Subtarget,
11415                                        SelectionDAG &DAG) {
11416   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11417   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11418   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11419   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
11420 
11421   // Whenever we can lower this as a zext, that instruction is strictly faster
11422   // than any alternative. It also allows us to fold memory operands into the
11423   // shuffle in many cases.
11424   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
11425                                                          Mask, Subtarget, DAG))
11426     return ZExt;
11427 
11428   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
11429                                                 Subtarget, DAG))
11430     return Blend;
11431 
11432   // Check for being able to broadcast a single element.
11433   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
11434                                                         Mask, Subtarget, DAG))
11435     return Broadcast;
11436 
11437   // If the shuffle mask is repeated in each 128-bit lane we can use more
11438   // efficient instructions that mirror the shuffles across the two 128-bit
11439   // lanes.
11440   SmallVector<int, 4> RepeatedMask;
11441   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
11442     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11443     if (V2.isUndef())
11444       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
11445                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11446 
11447     // Use dedicated unpack instructions for masks that match their pattern.
11448     if (SDValue V =
11449             lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
11450       return V;
11451   }
11452 
11453   // Try to use shift instructions.
11454   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
11455                                                 Subtarget, DAG))
11456     return Shift;
11457 
11458   // Try to use byte rotation instructions.
11459   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11460           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11461     return Rotate;
11462 
11463   // Try to create an in-lane repeating shuffle mask and then shuffle
11464   // the results into the target lanes.
11465   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11466           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11467     return V;
11468 
11469   // If the shuffle patterns aren't repeated but this is a single-input
11470   // shuffle, directly generate a cross-lane VPERMD instruction.
11471   if (V2.isUndef()) {
11472     SDValue VPermMask[8];
11473     for (int i = 0; i < 8; ++i)
11474       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11475                                  : DAG.getConstant(Mask[i], DL, MVT::i32);
11476     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
11477                        DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11478   }
11479 
11480   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11481   // shuffle.
11482   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11483           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11484     return Result;
11485 
11486   // Otherwise fall back on generic blend lowering.
11487   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
11488                                                     Mask, DAG);
11489 }
11490 
11491 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
11492 ///
11493 /// This routine is only called when we have AVX2 and thus a reasonable
11494 /// instruction set for v16i16 shuffling.
11495 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11496                                         SDValue V1, SDValue V2,
11497                                         const X86Subtarget &Subtarget,
11498                                         SelectionDAG &DAG) {
11499   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11500   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11501   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11502   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
11503 
11504   // Whenever we can lower this as a zext, that instruction is strictly faster
11505   // than any alternative. It also allows us to fold memory operands into the
11506   // shuffle in many cases.
11507   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i16, V1, V2,
11508                                                          Mask, Subtarget, DAG))
11509     return ZExt;
11510 
11511   // Check for being able to broadcast a single element.
11512   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
11513                                                         Mask, Subtarget, DAG))
11514     return Broadcast;
11515 
11516   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
11517                                                 Subtarget, DAG))
11518     return Blend;
11519 
11520   // Use dedicated unpack instructions for masks that match their pattern.
11521   if (SDValue V =
11522           lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
11523     return V;
11524 
11525   // Try to use shift instructions.
11526   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
11527                                                 Subtarget, DAG))
11528     return Shift;
11529 
11530   // Try to use byte rotation instructions.
11531   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11532           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11533     return Rotate;
11534 
11535   // Try to create an in-lane repeating shuffle mask and then shuffle
11536   // the results into the target lanes.
11537   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11538           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11539     return V;
11540 
11541   if (V2.isUndef()) {
11542     // There are no generalized cross-lane shuffle operations available on i16
11543     // element types.
11544     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
11545       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
11546                                                      Mask, DAG);
11547 
11548     SmallVector<int, 8> RepeatedMask;
11549     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11550       // As this is a single-input shuffle, the repeated mask should be
11551       // a strictly valid v8i16 mask that we can pass through to the v8i16
11552       // lowering to handle even the v16 case.
11553       return lowerV8I16GeneralSingleInputVectorShuffle(
11554           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
11555     }
11556   }
11557 
11558   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
11559                                                     V2, Subtarget, DAG))
11560     return PSHUFB;
11561 
11562   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11563   // shuffle.
11564   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11565           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11566     return Result;
11567 
11568   // Otherwise fall back on generic lowering.
11569   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
11570 }
11571 
11572 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
11573 ///
11574 /// This routine is only called when we have AVX2 and thus a reasonable
11575 /// instruction set for v32i8 shuffling.
11576 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11577                                        SDValue V1, SDValue V2,
11578                                        const X86Subtarget &Subtarget,
11579                                        SelectionDAG &DAG) {
11580   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11581   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11582   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11583   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
11584 
11585   // Whenever we can lower this as a zext, that instruction is strictly faster
11586   // than any alternative. It also allows us to fold memory operands into the
11587   // shuffle in many cases.
11588   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
11589                                                          Mask, Subtarget, DAG))
11590     return ZExt;
11591 
11592   // Check for being able to broadcast a single element.
11593   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
11594                                                         Mask, Subtarget, DAG))
11595     return Broadcast;
11596 
11597   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
11598                                                 Subtarget, DAG))
11599     return Blend;
11600 
11601   // Use dedicated unpack instructions for masks that match their pattern.
11602   if (SDValue V =
11603           lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
11604     return V;
11605 
11606   // Try to use shift instructions.
11607   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
11608                                                 Subtarget, DAG))
11609     return Shift;
11610 
11611   // Try to use byte rotation instructions.
11612   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11613           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11614     return Rotate;
11615 
11616   // Try to create an in-lane repeating shuffle mask and then shuffle
11617   // the results into the target lanes.
11618   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11619           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11620     return V;
11621 
11622   // There are no generalized cross-lane shuffle operations available on i8
11623   // element types.
11624   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
11625     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
11626                                                    DAG);
11627 
11628   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
11629                                                     V2, Subtarget, DAG))
11630     return PSHUFB;
11631 
11632   // Try to simplify this by merging 128-bit lanes to enable a lane-based
11633   // shuffle.
11634   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11635           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11636     return Result;
11637 
11638   // Otherwise fall back on generic lowering.
11639   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
11640 }
11641 
11642 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
11643 ///
11644 /// This routine either breaks down the specific type of a 256-bit x86 vector
11645 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
11646 /// together based on the available instructions.
11647 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11648                                         MVT VT, SDValue V1, SDValue V2,
11649                                         const X86Subtarget &Subtarget,
11650                                         SelectionDAG &DAG) {
11651   // If we have a single input to the zero element, insert that into V1 if we
11652   // can do so cheaply.
11653   int NumElts = VT.getVectorNumElements();
11654   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
11655 
11656   if (NumV2Elements == 1 && Mask[0] >= NumElts)
11657     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11658                               DL, VT, V1, V2, Mask, Subtarget, DAG))
11659       return Insertion;
11660 
11661   // Handle special cases where the lower or upper half is UNDEF.
11662   if (SDValue V =
11663           lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
11664     return V;
11665 
11666   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
11667   // can check for those subtargets here and avoid much of the subtarget
11668   // querying in the per-vector-type lowering routines. With AVX1 we have
11669   // essentially *zero* ability to manipulate a 256-bit vector with integer
11670   // types. Since we'll use floating point types there eventually, just
11671   // immediately cast everything to a float and operate entirely in that domain.
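  // For example, a v8i32 shuffle on AVX1 is performed as a v8f32 shuffle and
  // bitcast back, while sub-32-bit element types fall back to bit
  // masking/blending or to splitting into two 128-bit halves.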
11672   if (VT.isInteger() && !Subtarget.hasAVX2()) {
11673     int ElementBits = VT.getScalarSizeInBits();
11674     if (ElementBits < 32) {
11675       // No floating point type available, if we can't use the bit operations
11676       // for masking/blending then decompose into 128-bit vectors.
11677       if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
11678         return V;
11679       if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11680         return V;
11681       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11682     }
11683 
11684     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
11685                                 VT.getVectorNumElements());
11686     V1 = DAG.getBitcast(FpVT, V1);
11687     V2 = DAG.getBitcast(FpVT, V2);
11688     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
11689   }
11690 
11691   switch (VT.SimpleTy) {
11692   case MVT::v4f64:
11693     return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11694   case MVT::v4i64:
11695     return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11696   case MVT::v8f32:
11697     return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11698   case MVT::v8i32:
11699     return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11700   case MVT::v16i16:
11701     return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11702   case MVT::v32i8:
11703     return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11704 
11705   default:
11706     llvm_unreachable("Not a valid 256-bit x86 vector type!");
11707   }
11708 }
11709 
11710 /// \brief Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
11711 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
11712                                         ArrayRef<int> Mask, SDValue V1,
11713                                         SDValue V2, SelectionDAG &DAG) {
11714   assert(VT.getScalarSizeInBits() == 64 &&
11715          "Unexpected element type size for 128bit shuffle.");
11716 
11717   // Handling a 256-bit vector here would require VLX, and
11718   // lowerV2X128VectorShuffle() is most probably the better solution for that.
11719   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
11720 
11721   SmallVector<int, 4> WidenedMask;
11722   if (!canWidenShuffleElements(Mask, WidenedMask))
11723     return SDValue();
11724 
11725   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11726   // Ensure the elements come from the same Op.
11727   int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
11728   for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11729     if (WidenedMask[i] == SM_SentinelZero)
11730       return SDValue();
11731     if (WidenedMask[i] == SM_SentinelUndef)
11732       continue;
11733 
11734     SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
11735     unsigned OpIndex = (i < Size/2) ? 0 : 1;
11736     if (Ops[OpIndex].isUndef())
11737       Ops[OpIndex] = Op;
11738     else if (Ops[OpIndex] != Op)
11739       return SDValue();
11740   }
11741 
11742   // Form a 128-bit permutation.
11743   // Convert the 64-bit shuffle mask selection values into 128-bit selection
11744   // bits defined by a vshuf64x2 instruction's immediate control byte.
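  // For example, the widened mask <0, 1, 4, 5> keeps both 128-bit blocks of V1
  // in the low half and both blocks of V2 in the high half, which encodes as
  // PermMask = 0x44.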
11745   unsigned PermMask = 0, Imm = 0;
11746   unsigned ControlBitsNum = WidenedMask.size() / 2;
11747 
11748   for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11749     // Use first element in place of undef mask.
11750     Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
11751     PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
11752   }
11753 
11754   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
11755                      DAG.getConstant(PermMask, DL, MVT::i8));
11756 }
11757 
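/// Lower a shuffle with a single variable cross-lane permute: VPERMV when V2
/// is undef, or VPERMV3 to select elements from both V1 and V2.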
11758 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
11759                                            ArrayRef<int> Mask, SDValue V1,
11760                                            SDValue V2, SelectionDAG &DAG) {
11761 
11762   assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
11763 
11764   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
11765   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
11766 
11767   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
11768   if (V2.isUndef())
11769     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
11770 
11771   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
11772 }
11773 
11774 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
11775 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11776                                        SDValue V1, SDValue V2,
11777                                        const X86Subtarget &Subtarget,
11778                                        SelectionDAG &DAG) {
11779   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11780   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11781   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11782 
11783   if (V2.isUndef()) {
11784     // Use low duplicate instructions for masks that match their pattern.
11785     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
11786       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
11787 
11788     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
11789       // Non-half-crossing single input shuffles can be lowered with an
11790       // interleaved permutation.
11791       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11792                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
11793                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
11794                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
11795       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
11796                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11797     }
11798 
11799     SmallVector<int, 4> RepeatedMask;
11800     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
11801       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
11802                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11803   }
11804 
11805   if (SDValue Shuf128 =
11806           lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
11807     return Shuf128;
11808 
11809   if (SDValue Unpck =
11810           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
11811     return Unpck;
11812 
11813   // Check if the blend happens to exactly fit that of SHUFPD.
11814   if (SDValue Op =
11815       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
11816     return Op;
11817 
11818   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
11819 }
11820 
11821 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
11822 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
11823                                         SDValue V1, SDValue V2,
11824                                         const X86Subtarget &Subtarget,
11825                                         SelectionDAG &DAG) {
11826   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11827   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11828   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11829 
11830   // If the shuffle mask is repeated in each 128-bit lane, we have many more
11831   // options to efficiently lower the shuffle.
11832   SmallVector<int, 4> RepeatedMask;
11833   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
11834     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11835 
11836     // Use even/odd duplicate instructions for masks that match their pattern.
11837     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11838       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
11839     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11840       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
11841 
11842     if (V2.isUndef())
11843       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
11844                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11845 
11846     // Use dedicated unpack instructions for masks that match their pattern.
11847     if (SDValue Unpck =
11848             lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
11849       return Unpck;
11850 
11851     // Otherwise, fall back to a SHUFPS sequence.
11852     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
11853   }
11854 
11855   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
11856 }
11857 
11858 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11859 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11860                                        SDValue V1, SDValue V2,
11861                                        const X86Subtarget &Subtarget,
11862                                        SelectionDAG &DAG) {
11863   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11864   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11865   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11866 
11867   if (SDValue Shuf128 =
11868           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
11869     return Shuf128;
11870 
11871   if (V2.isUndef()) {
11872     // When the shuffle is repeated across the 128-bit lanes of the input, we
11873     // can use lower-latency instructions that operate on all four 128-bit
11874     // lanes at once.
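          // For example, the v8i64 mask <1,0,3,2,5,4,7,6> repeats <1,0> in every
          // 128-bit lane; scaling that by 2 gives the v16i32 PSHUFD-style mask
          // <2,3,0,1>, which swaps the two i64 halves of each 128-bit lane.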
11875     SmallVector<int, 2> Repeated128Mask;
11876     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
11877       SmallVector<int, 4> PSHUFDMask;
11878       scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
11879       return DAG.getBitcast(
11880           MVT::v8i64,
11881           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
11882                       DAG.getBitcast(MVT::v16i32, V1),
11883                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11884     }
11885 
11886     SmallVector<int, 4> Repeated256Mask;
11887     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
11888       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
11889                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
11890   }
11891 
11892   // Try to use shift instructions.
11893   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
11894                                                 Subtarget, DAG))
11895     return Shift;
11896 
11897   if (SDValue Unpck =
11898           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
11899     return Unpck;
11900 
11901   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
11902 }
11903 
11904 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11905 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11906                                         SDValue V1, SDValue V2,
11907                                         const X86Subtarget &Subtarget,
11908                                         SelectionDAG &DAG) {
11909   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11910   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11911   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11912 
11913   // If the shuffle mask is repeated in each 128-bit lane we can use more
11914   // efficient instructions that mirror the shuffles across the four 128-bit
11915   // lanes.
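        // For example, the v16i32 mask <3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12>
        // repeats <3,2,1,0> per lane and, with an undef V2, maps to a single
        // PSHUFD with that 4-element immediate.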
11916   SmallVector<int, 4> RepeatedMask;
11917   if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
11918     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11919     if (V2.isUndef())
11920       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
11921                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11922 
11923     // Use dedicated unpack instructions for masks that match their pattern.
11924     if (SDValue V =
11925             lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
11926       return V;
11927   }
11928 
11929   // Try to use shift instructions.
11930   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
11931                                                 Subtarget, DAG))
11932     return Shift;
11933 
11934   // Try to use byte rotation instructions.
11935   if (Subtarget.hasBWI())
11936     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11937             DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
11938       return Rotate;
11939 
11940   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
11941 }
11942 
11943 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11944 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11945                                         SDValue V1, SDValue V2,
11946                                         const X86Subtarget &Subtarget,
11947                                         SelectionDAG &DAG) {
11948   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11949   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11950   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11951   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11952 
11953   // Use dedicated unpack instructions for masks that match their pattern.
11954   if (SDValue V =
11955           lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
11956     return V;
11957 
11958   // Try to use shift instructions.
11959   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
11960                                                 Subtarget, DAG))
11961     return Shift;
11962 
11963   // Try to use byte rotation instructions.
11964   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11965           DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
11966     return Rotate;
11967 
11968   if (V2.isUndef()) {
11969     SmallVector<int, 8> RepeatedMask;
11970     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
11971       // As this is a single-input shuffle, the repeated mask should be
11972       // a strictly valid v8i16 mask that we can pass through to the v8i16
11973       // lowering to handle even the v32 case.
11974       return lowerV8I16GeneralSingleInputVectorShuffle(
11975           DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
11976     }
11977   }
11978 
11979   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
11980 }
11981 
11982 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
11983 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11984                                        SDValue V1, SDValue V2,
11985                                        const X86Subtarget &Subtarget,
11986                                        SelectionDAG &DAG) {
11987   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11988   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11989   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11990   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11991 
11992   // Use dedicated unpack instructions for masks that match their pattern.
11993   if (SDValue V =
11994           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
11995     return V;
11996 
11997   // Try to use shift instructions.
11998   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
11999                                                 Subtarget, DAG))
12000     return Shift;
12001 
12002   // Try to use byte rotation instructions.
12003   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12004           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
12005     return Rotate;
12006 
12007   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
12008                                                     V2, Subtarget, DAG))
12009     return PSHUFB;
12010 
12011   // FIXME: Implement direct support for this type!
12012   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
12013 }
12014 
12015 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
12016 ///
12017 /// This routine either breaks down the specific type of a 512-bit x86 vector
12018 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
12019 /// together based on the available instructions.
12020 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12021                                         MVT VT, SDValue V1, SDValue V2,
12022                                         const X86Subtarget &Subtarget,
12023                                         SelectionDAG &DAG) {
12024   assert(Subtarget.hasAVX512() &&
12025          "Cannot lower 512-bit vectors w/ basic ISA!");
12026 
12027   // Check for being able to broadcast a single element.
12028   if (SDValue Broadcast =
12029           lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
12030     return Broadcast;
12031 
12032   // Dispatch to each element type for lowering. If we don't have support for
12033   // specific element type shuffles at 512 bits, immediately split them and
12034   // lower them. Each lowering routine of a given type is allowed to assume that
12035   // the requisite ISA extensions for that element type are available.
12036   switch (VT.SimpleTy) {
12037   case MVT::v8f64:
12038     return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12039   case MVT::v16f32:
12040     return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12041   case MVT::v8i64:
12042     return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12043   case MVT::v16i32:
12044     return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12045   case MVT::v32i16:
12046     return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12047   case MVT::v64i8:
12048     return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12049 
12050   default:
12051     llvm_unreachable("Not a valid 512-bit x86 vector type!");
12052   }
12053 }
12054 
12055 // Lower vXi1 vector shuffles.
12056 // There is no dedicated instruction on AVX-512 that shuffles the masks.
12057 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
12058 // vector, shuffle it, and then truncate it back.
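      // For example, a v16i1 shuffle is performed by sign-extending both operands
      // to v16i32 (each i1 becomes 0 or -1 in a 32-bit lane), shuffling the
      // v16i32 vectors, and truncating the result back to v16i1.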
12059 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12060                                       MVT VT, SDValue V1, SDValue V2,
12061                                       const X86Subtarget &Subtarget,
12062                                       SelectionDAG &DAG) {
12063   assert(Subtarget.hasAVX512() &&
12064          "Cannot lower 512-bit vectors w/o basic ISA!");
12065   MVT ExtVT;
12066   switch (VT.SimpleTy) {
12067   default:
12068     llvm_unreachable("Expected a vector of i1 elements");
12069   case MVT::v2i1:
12070     ExtVT = MVT::v2i64;
12071     break;
12072   case MVT::v4i1:
12073     ExtVT = MVT::v4i32;
12074     break;
12075   case MVT::v8i1:
12076     ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
12077     break;
12078   case MVT::v16i1:
12079     ExtVT = MVT::v16i32;
12080     break;
12081   case MVT::v32i1:
12082     ExtVT = MVT::v32i16;
12083     break;
12084   case MVT::v64i1:
12085     ExtVT = MVT::v64i8;
12086     break;
12087   }
12088 
12089   if (ISD::isBuildVectorAllZeros(V1.getNode()))
12090     V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12091   else if (ISD::isBuildVectorAllOnes(V1.getNode()))
12092     V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12093   else
12094     V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
12095 
12096   if (V2.isUndef())
12097     V2 = DAG.getUNDEF(ExtVT);
12098   else if (ISD::isBuildVectorAllZeros(V2.getNode()))
12099     V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12100   else if (ISD::isBuildVectorAllOnes(V2.getNode()))
12101     V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12102   else
12103     V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
12104   return DAG.getNode(ISD::TRUNCATE, DL, VT,
12105                      DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
12106 }
12107 /// \brief Top-level lowering for x86 vector shuffles.
12108 ///
12109 /// This handles decomposition, canonicalization, and lowering of all x86
12110 /// vector shuffles. Most of the specific lowering strategies are encapsulated
12111 /// above in helper routines. The canonicalization attempts to widen shuffles
12112 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
12113 /// s.t. only one of the two inputs needs to be tested, etc.
12114 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
12115                                   SelectionDAG &DAG) {
12116   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12117   ArrayRef<int> Mask = SVOp->getMask();
12118   SDValue V1 = Op.getOperand(0);
12119   SDValue V2 = Op.getOperand(1);
12120   MVT VT = Op.getSimpleValueType();
12121   int NumElements = VT.getVectorNumElements();
12122   SDLoc DL(Op);
12123   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
12124 
12125   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
12126          "Can't lower MMX shuffles");
12127 
12128   bool V1IsUndef = V1.isUndef();
12129   bool V2IsUndef = V2.isUndef();
12130   if (V1IsUndef && V2IsUndef)
12131     return DAG.getUNDEF(VT);
12132 
12133   // When we create a shuffle node we put the UNDEF node as the second
12134   // operand, but in some cases the first operand may be transformed to UNDEF.
12135   // In this case we should just commute the node.
12136   if (V1IsUndef)
12137     return DAG.getCommutedVectorShuffle(*SVOp);
12138 
12139   // Check for non-undef masks pointing at an undef vector and make the masks
12140   // undef as well. This makes it easier to match the shuffle based solely on
12141   // the mask.
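        // For example, with 4 elements and an undef V2, the mask <0,5,2,7> is
        // rewritten as <0,-1,2,-1>.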
12142   if (V2IsUndef)
12143     for (int M : Mask)
12144       if (M >= NumElements) {
12145         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
12146         for (int &M : NewMask)
12147           if (M >= NumElements)
12148             M = -1;
12149         return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
12150       }
12151 
12152   // We actually see shuffles that are entirely re-arrangements of a set of
12153   // zero inputs. This mostly happens while decomposing complex shuffles into
12154   // simple ones. Directly lower these as a buildvector of zeros.
12155   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
12156   if (Zeroable.all())
12157     return getZeroVector(VT, Subtarget, DAG, DL);
12158 
12159   // Try to collapse shuffles into using a vector type with fewer elements but
12160   // wider element types. We cap this to not form integers or floating point
12161   // elements wider than 64 bits, but it might be interesting to form i128
12162   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
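        // For example, the v4i32 mask <0,1,6,7> pairs up cleanly and can instead
        // be lowered as the v2i64 shuffle <0,3> on bitcast operands.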
12163   SmallVector<int, 16> WidenedMask;
12164   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
12165       canWidenShuffleElements(Mask, WidenedMask)) {
12166     MVT NewEltVT = VT.isFloatingPoint()
12167                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
12168                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
12169     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12170     // Make sure that the new vector type is legal. For example, v2f64 isn't
12171     // legal on SSE1.
12172     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12173       V1 = DAG.getBitcast(NewVT, V1);
12174       V2 = DAG.getBitcast(NewVT, V2);
12175       return DAG.getBitcast(
12176           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
12177     }
12178   }
12179 
12180   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
12181   for (int M : Mask)
12182     if (M < 0)
12183       ++NumUndefElements;
12184     else if (M < NumElements)
12185       ++NumV1Elements;
12186     else
12187       ++NumV2Elements;
12188 
12189   // Commute the shuffle as needed such that more elements come from V1 than
12190   // V2. This allows us to match the shuffle pattern strictly on how many
12191   // elements come from V1 without handling the symmetric cases.
12192   if (NumV2Elements > NumV1Elements)
12193     return DAG.getCommutedVectorShuffle(*SVOp);
12194 
12195   assert(NumV1Elements > 0 && "No V1 indices");
12196   assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
12197 
12198   // When the number of V1 and V2 elements is the same, try to minimize the
12199   // number of uses of V2 in the low half of the vector. When that is tied,
12200   // ensure that the sum of indices for V1 is equal to or lower than the sum of
12201   // indices for V2. When those are equal, try to ensure that the number of odd
12202   // indices for V1 is lower than the number of odd indices for V2.
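        // For example, the v4i32 mask <0,5,2,7> and its commuted form <4,1,6,3>
        // both take two elements from each input and tie on low-half uses; the
        // first form is kept because the V1 elements occupy the lower indices.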
12203   if (NumV1Elements == NumV2Elements) {
12204     int LowV1Elements = 0, LowV2Elements = 0;
12205     for (int M : Mask.slice(0, NumElements / 2))
12206       if (M >= NumElements)
12207         ++LowV2Elements;
12208       else if (M >= 0)
12209         ++LowV1Elements;
12210     if (LowV2Elements > LowV1Elements)
12211       return DAG.getCommutedVectorShuffle(*SVOp);
12212     if (LowV2Elements == LowV1Elements) {
12213       int SumV1Indices = 0, SumV2Indices = 0;
12214       for (int i = 0, Size = Mask.size(); i < Size; ++i)
12215         if (Mask[i] >= NumElements)
12216           SumV2Indices += i;
12217         else if (Mask[i] >= 0)
12218           SumV1Indices += i;
12219       if (SumV2Indices < SumV1Indices)
12220         return DAG.getCommutedVectorShuffle(*SVOp);
12221       if (SumV2Indices == SumV1Indices) {
12222         int NumV1OddIndices = 0, NumV2OddIndices = 0;
12223         for (int i = 0, Size = Mask.size(); i < Size; ++i)
12224           if (Mask[i] >= NumElements)
12225             NumV2OddIndices += i % 2;
12226           else if (Mask[i] >= 0)
12227             NumV1OddIndices += i % 2;
12228         if (NumV2OddIndices < NumV1OddIndices)
12229           return DAG.getCommutedVectorShuffle(*SVOp);
12230       }
12231     }
12232   }
12233 
12234   // For each vector width, delegate to a specialized lowering routine.
12235   if (VT.is128BitVector())
12236     return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12237 
12238   if (VT.is256BitVector())
12239     return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12240 
12241   if (VT.is512BitVector())
12242     return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12243 
12244   if (Is1BitVector)
12245     return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12246 
12247   llvm_unreachable("Unimplemented!");
12248 }
12249 
12250 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
12251 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
12252                                            const X86Subtarget &Subtarget,
12253                                            SelectionDAG &DAG) {
12254   SDValue Cond = Op.getOperand(0);
12255   SDValue LHS = Op.getOperand(1);
12256   SDValue RHS = Op.getOperand(2);
12257   SDLoc dl(Op);
12258   MVT VT = Op.getSimpleValueType();
12259 
12260   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12261     return SDValue();
12262   auto *CondBV = cast<BuildVectorSDNode>(Cond);
12263 
12264   // Only non-legal VSELECTs reach this lowering; convert those into generic
12265   // shuffles and reuse the shuffle lowering path for blends.
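        // For example, a v4i32 vselect with the constant condition <-1,0,-1,0>
        // becomes the shuffle mask <0,5,2,7>: a true lane takes element i of LHS,
        // a false lane takes element i + Size of RHS, and any other (e.g. undef)
        // condition lane becomes undef (-1).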
12266   SmallVector<int, 32> Mask;
12267   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
12268     SDValue CondElt = CondBV->getOperand(i);
12269     Mask.push_back(
12270         isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
12271                                      : -1);
12272   }
12273   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
12274 }
12275 
12276 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12277   // A vselect where all conditions and data are constants can be optimized into
12278   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12279   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12280       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12281       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12282     return SDValue();
12283 
12284   // Try to lower this to a blend-style vector shuffle. This can handle all
12285   // constant condition cases.
12286   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
12287     return BlendOp;
12288 
12289   // Variable blends are only legal from SSE4.1 onward.
12290   if (!Subtarget.hasSSE41())
12291     return SDValue();
12292 
12293   // Only some types will be legal on some subtargets. If we can emit a legal
12294   // VSELECT-matching blend, return Op, but if we need to expand, return
12295   // a null value.
12296   switch (Op.getSimpleValueType().SimpleTy) {
12297   default:
12298     // Most of the vector types have blends past SSE4.1.
12299     return Op;
12300 
12301   case MVT::v32i8:
12302     // The byte blends for AVX vectors were introduced only in AVX2.
12303     if (Subtarget.hasAVX2())
12304       return Op;
12305 
12306     return SDValue();
12307 
12308   case MVT::v8i16:
12309   case MVT::v16i16:
12310     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
12311     if (Subtarget.hasBWI() && Subtarget.hasVLX())
12312       return Op;
12313 
12314     // FIXME: We should custom lower this by fixing the condition and using i8
12315     // blends.
12316     return SDValue();
12317   }
12318 }
12319 
12320 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12321   MVT VT = Op.getSimpleValueType();
12322   SDLoc dl(Op);
12323 
12324   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12325     return SDValue();
12326 
12327   if (VT.getSizeInBits() == 8) {
12328     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12329                                   Op.getOperand(0), Op.getOperand(1));
12330     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12331                                   DAG.getValueType(VT));
12332     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12333   }
12334 
12335   if (VT.getSizeInBits() == 16) {
12336     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12337     if (isNullConstant(Op.getOperand(1)))
12338       return DAG.getNode(
12339           ISD::TRUNCATE, dl, MVT::i16,
12340           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12341                       DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12342                       Op.getOperand(1)));
12343     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12344                                   Op.getOperand(0), Op.getOperand(1));
12345     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12346                                   DAG.getValueType(VT));
12347     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12348   }
12349 
12350   if (VT == MVT::f32) {
12351     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12352     // the result back to FR32 register. It's only worth matching if the
12353     // result has a single use which is a store or a bitcast to i32.  And in
12354     // the case of a store, it's not worth it if the index is a constant 0,
12355     // because a MOVSSmr can be used instead, which is smaller and faster.
12356     if (!Op.hasOneUse())
12357       return SDValue();
12358     SDNode *User = *Op.getNode()->use_begin();
12359     if ((User->getOpcode() != ISD::STORE ||
12360          isNullConstant(Op.getOperand(1))) &&
12361         (User->getOpcode() != ISD::BITCAST ||
12362          User->getValueType(0) != MVT::i32))
12363       return SDValue();
12364     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12365                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12366                                   Op.getOperand(1));
12367     return DAG.getBitcast(MVT::f32, Extract);
12368   }
12369 
12370   if (VT == MVT::i32 || VT == MVT::i64) {
12371     // ExtractPS/pextrq works with constant index.
12372     if (isa<ConstantSDNode>(Op.getOperand(1)))
12373       return Op;
12374   }
12375   return SDValue();
12376 }
12377 
12378 /// Extract one bit from mask vector, like v16i1 or v8i1.
12379 /// AVX-512 feature.
12380 SDValue
12381 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12382   SDValue Vec = Op.getOperand(0);
12383   SDLoc dl(Vec);
12384   MVT VecVT = Vec.getSimpleValueType();
12385   SDValue Idx = Op.getOperand(1);
12386   MVT EltVT = Op.getSimpleValueType();
12387 
12388   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12389   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
12390          "Unexpected vector type in ExtractBitFromMaskVector");
12391 
12392   // A variable index can't be handled in mask registers,
12393   // so extend the vector to VR512.
12394   if (!isa<ConstantSDNode>(Idx)) {
12395     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12396     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12397     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12398                               ExtVT.getVectorElementType(), Ext, Idx);
12399     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12400   }
12401 
12402   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12403   if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
12404     // Use the kshiftlw/kshiftrw instructions.
12405     VecVT = MVT::v16i1;
12406     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
12407                       DAG.getUNDEF(VecVT),
12408                       Vec,
12409                       DAG.getIntPtrConstant(0, dl));
12410   }
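        // Shift the requested bit up to the highest mask position and then back
        // down to position 0, so that extracting element 0 yields it. For example,
        // with IdxVal == 3 in a v16i1 mask: kshift left by 12, then kshift right
        // by 15.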
12411   unsigned MaxShift = VecVT.getVectorNumElements() - 1;
12412   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12413                     DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
12414   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12415                     DAG.getConstant(MaxShift, dl, MVT::i8));
12416   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12417                        DAG.getIntPtrConstant(0, dl));
12418 }
12419 
12420 SDValue
12421 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12422                                            SelectionDAG &DAG) const {
12423   SDLoc dl(Op);
12424   SDValue Vec = Op.getOperand(0);
12425   MVT VecVT = Vec.getSimpleValueType();
12426   SDValue Idx = Op.getOperand(1);
12427 
12428   if (Op.getSimpleValueType() == MVT::i1)
12429     return ExtractBitFromMaskVector(Op, DAG);
12430 
12431   if (!isa<ConstantSDNode>(Idx)) {
12432     if (VecVT.is512BitVector() ||
12433         (VecVT.is256BitVector() && Subtarget.hasInt256() &&
12434          VecVT.getVectorElementType().getSizeInBits() == 32)) {
12435 
12436       MVT MaskEltVT =
12437         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12438       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12439                                     MaskEltVT.getSizeInBits());
12440 
12441       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12442       auto PtrVT = getPointerTy(DAG.getDataLayout());
12443       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12444                                  getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
12445                                  DAG.getConstant(0, dl, PtrVT));
12446       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12447       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
12448                          DAG.getConstant(0, dl, PtrVT));
12449     }
12450     return SDValue();
12451   }
12452 
12453   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12454 
12455   // If this is a 256-bit vector result, first extract the 128-bit vector and
12456   // then extract the element from the 128-bit vector.
12457   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12458     // Get the 128-bit vector.
12459     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
12460     MVT EltVT = VecVT.getVectorElementType();
12461 
12462     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12463     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
12464 
12465     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
12466     // this can be done with a mask.
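          // For example, extracting element 11 of a v16i32 first extracts the
          // third 128-bit chunk and then takes its element 11 & 3 == 3.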
12467     IdxVal &= ElemsPerChunk - 1;
12468     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12469                        DAG.getConstant(IdxVal, dl, MVT::i32));
12470   }
12471 
12472   assert(VecVT.is128BitVector() && "Unexpected vector length");
12473 
12474   if (Subtarget.hasSSE41())
12475     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
12476       return Res;
12477 
12478   MVT VT = Op.getSimpleValueType();
12479   // TODO: handle v16i8.
12480   if (VT.getSizeInBits() == 16) {
12481     if (IdxVal == 0)
12482       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12483                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12484                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
12485 
12486     // Transform it so it matches pextrw, which produces a 32-bit result.
12487     MVT EltVT = MVT::i32;
12488     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
12489     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12490                                   DAG.getValueType(VT));
12491     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12492   }
12493 
12494   if (VT.getSizeInBits() == 32) {
12495     if (IdxVal == 0)
12496       return Op;
12497 
12498     // SHUFPS the element to the lowest double word, then movss.
12499     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
12500     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12501     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12502                        DAG.getIntPtrConstant(0, dl));
12503   }
12504 
12505   if (VT.getSizeInBits() == 64) {
12506     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12507     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12508     //        to match extract_elt for f64.
12509     if (IdxVal == 0)
12510       return Op;
12511 
12512     // UNPCKHPD the element to the lowest double word, then movsd.
12513     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12514     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12515     int Mask[2] = { 1, -1 };
12516     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12517     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12518                        DAG.getIntPtrConstant(0, dl));
12519   }
12520 
12521   return SDValue();
12522 }
12523 
12524 /// Insert one bit to mask vector, like v16i1 or v8i1.
12525 /// AVX-512 feature.
12526 SDValue
12527 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12528   SDLoc dl(Op);
12529   SDValue Vec = Op.getOperand(0);
12530   SDValue Elt = Op.getOperand(1);
12531   SDValue Idx = Op.getOperand(2);
12532   MVT VecVT = Vec.getSimpleValueType();
12533 
12534   if (!isa<ConstantSDNode>(Idx)) {
12535     // Non-constant index: extend the source and destination,
12536     // insert the element, and then truncate the result.
12537     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12538     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
12539     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
12540       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
12541       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
12542     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
12543   }
12544 
12545   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
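        // Materialize the bit in lane 0, shift it up to lane IdxVal, and OR it
        // into the destination mask.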
12546   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
12547   if (IdxVal)
12548     EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12549                            DAG.getConstant(IdxVal, dl, MVT::i8));
12550   if (Vec.isUndef())
12551     return EltInVec;
12552   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
12553 }
12554 
12555 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12556                                                   SelectionDAG &DAG) const {
12557   MVT VT = Op.getSimpleValueType();
12558   MVT EltVT = VT.getVectorElementType();
12559   unsigned NumElts = VT.getVectorNumElements();
12560 
12561   if (EltVT == MVT::i1)
12562     return InsertBitToMaskVector(Op, DAG);
12563 
12564   SDLoc dl(Op);
12565   SDValue N0 = Op.getOperand(0);
12566   SDValue N1 = Op.getOperand(1);
12567   SDValue N2 = Op.getOperand(2);
12568   if (!isa<ConstantSDNode>(N2))
12569     return SDValue();
12570   auto *N2C = cast<ConstantSDNode>(N2);
12571   unsigned IdxVal = N2C->getZExtValue();
12572 
12573   // If we are clearing out an element, we do this more efficiently with a
12574   // blend shuffle than a costly integer insertion.
12575   // TODO: would other rematerializable values (e.g. allbits) benefit as well?
12576   // TODO: pre-SSE41 targets will tend to use bit masking - this could still
12577   // be beneficial if we are inserting several zeros and can combine the masks.
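        // For example, inserting zero at index 2 of a v4i32 becomes a blend with
        // the mask <0,1,6,3>, taking lane 2 from the zero vector.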
12578   if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
12579     SmallVector<int, 8> ClearMask;
12580     for (unsigned i = 0; i != NumElts; ++i)
12581       ClearMask.push_back(i == IdxVal ? i + NumElts : i);
12582     SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
12583     return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
12584   }
12585 
12586   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
12587   // into that, and then insert the subvector back into the result.
12588   if (VT.is256BitVector() || VT.is512BitVector()) {
12589     // With a 256-bit vector, we can insert into the zero element efficiently
12590     // using a blend if we have AVX or AVX2 and the right data type.
12591     if (VT.is256BitVector() && IdxVal == 0) {
12592       // TODO: It is worthwhile to cast integer to floating point and back
12593       // and incur a domain crossing penalty if that's what we'll end up
12594       // doing anyway after extracting to a 128-bit vector.
12595       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12596           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
12597         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
12598         N2 = DAG.getIntPtrConstant(1, dl);
12599         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
12600       }
12601     }
12602 
12603     // Get the desired 128-bit vector chunk.
12604     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
12605 
12606     // Insert the element into the desired chunk.
12607     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
12608     assert(isPowerOf2_32(NumEltsIn128));
12609     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
12610     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
12611 
12612     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
12613                     DAG.getConstant(IdxIn128, dl, MVT::i32));
12614 
12615     // Insert the changed part back into the bigger vector
12616     return insert128BitVector(N0, V, IdxVal, DAG, dl);
12617   }
12618   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
12619 
12620   if (Subtarget.hasSSE41()) {
12621     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
12622       unsigned Opc;
12623       if (VT == MVT::v8i16) {
12624         Opc = X86ISD::PINSRW;
12625       } else {
12626         assert(VT == MVT::v16i8);
12627         Opc = X86ISD::PINSRB;
12628       }
12629 
12630       // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
12631       // argument.
12632       if (N1.getValueType() != MVT::i32)
12633         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12634       if (N2.getValueType() != MVT::i32)
12635         N2 = DAG.getIntPtrConstant(IdxVal, dl);
12636       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
12637     }
12638 
12639     if (EltVT == MVT::f32) {
12640       // Bits [7:6] of the constant are the source select. This will always be
12641       //   zero here. The DAG Combiner may combine an extract_elt index into
12642       //   these bits. For example (insert (extract, 3), 2) could be matched by
12643       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
12644       // Bits [5:4] of the constant are the destination select. This is the
12645       //   value of the incoming immediate.
12646       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
12647       //   combine either bitwise AND or insert of float 0.0 to set these bits.
12648 
12649       bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
12650       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
12651         // If this is an insertion of 32-bits into the low 32-bits of
12652         // a vector, we prefer to generate a blend with immediate rather
12653         // than an insertps. Blends are simpler operations in hardware and so
12654         // will always have equal or better performance than insertps.
12655         // But if optimizing for size and there's a load folding opportunity,
12656         // generate insertps because blendps does not have a 32-bit memory
12657         // operand form.
12658         N2 = DAG.getIntPtrConstant(1, dl);
12659         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12660         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
12661       }
12662       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
12663       // Create this as a scalar to vector.
12664       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12665       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
12666     }
12667 
12668     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
12669       // PINSR* works with constant index.
12670       return Op;
12671     }
12672   }
12673 
12674   if (EltVT == MVT::i8)
12675     return SDValue();
12676 
12677   if (EltVT.getSizeInBits() == 16) {
12678     // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
12679     // as its second argument.
12680     if (N1.getValueType() != MVT::i32)
12681       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12682     if (N2.getValueType() != MVT::i32)
12683       N2 = DAG.getIntPtrConstant(IdxVal, dl);
12684     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
12685   }
12686   return SDValue();
12687 }
12688 
12689 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
12690   SDLoc dl(Op);
12691   MVT OpVT = Op.getSimpleValueType();
12692 
12693   // If this is a 256-bit vector result, first insert into a 128-bit
12694   // vector and then insert into the 256-bit vector.
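        // For example, a v8i32 SCALAR_TO_VECTOR is built as a v4i32
        // SCALAR_TO_VECTOR inserted into the low 128 bits of an undef v8i32.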
12695   if (!OpVT.is128BitVector()) {
12696     // Insert into a 128-bit vector.
12697     unsigned SizeFactor = OpVT.getSizeInBits()/128;
12698     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
12699                                  OpVT.getVectorNumElements() / SizeFactor);
12700 
12701     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
12702 
12703     // Insert the 128-bit vector.
12704     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
12705   }
12706 
12707   if (OpVT == MVT::v1i64 &&
12708       Op.getOperand(0).getValueType() == MVT::i64)
12709     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
12710 
12711   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
12712   assert(OpVT.is128BitVector() && "Expected an SSE type!");
12713   return DAG.getBitcast(
12714       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
12715 }
12716 
12717 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
12718 // a simple subregister reference or explicit instructions to grab
12719 // upper bits of a vector.
12720 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12721                                       SelectionDAG &DAG) {
12722   SDLoc dl(Op);
12723   SDValue In =  Op.getOperand(0);
12724   SDValue Idx = Op.getOperand(1);
12725   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12726   MVT ResVT   = Op.getSimpleValueType();
12727   MVT InVT    = In.getSimpleValueType();
12728 
12729   if (Subtarget.hasFp256()) {
12730     if (ResVT.is128BitVector() &&
12731         (InVT.is256BitVector() || InVT.is512BitVector()) &&
12732         isa<ConstantSDNode>(Idx)) {
12733       return extract128BitVector(In, IdxVal, DAG, dl);
12734     }
12735     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
12736         isa<ConstantSDNode>(Idx)) {
12737       return extract256BitVector(In, IdxVal, DAG, dl);
12738     }
12739   }
12740   return SDValue();
12741 }
12742 
12743 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
12744 // simple superregister reference or explicit instructions to insert
12745 // the upper bits of a vector.
12746 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12747                                      SelectionDAG &DAG) {
12748   if (!Subtarget.hasAVX())
12749     return SDValue();
12750 
12751   SDLoc dl(Op);
12752   SDValue Vec = Op.getOperand(0);
12753   SDValue SubVec = Op.getOperand(1);
12754   SDValue Idx = Op.getOperand(2);
12755 
12756   if (!isa<ConstantSDNode>(Idx))
12757     return SDValue();
12758 
12759   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12760   MVT OpVT = Op.getSimpleValueType();
12761   MVT SubVecVT = SubVec.getSimpleValueType();
12762 
12763   // Fold two 16-byte subvector loads into one 32-byte load:
12764   // (insert_subvector (insert_subvector undef, (load addr), 0),
12765   //                   (load addr + 16), Elts/2)
12766   // --> load32 addr
12767   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
12768       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
12769       OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
12770     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
12771     if (Idx2 && Idx2->getZExtValue() == 0) {
12772       // If needed, look through bitcasts to get to the load.
12773       SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
12774       if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
12775         bool Fast;
12776         unsigned Alignment = FirstLd->getAlignment();
12777         unsigned AS = FirstLd->getAddressSpace();
12778         const X86TargetLowering *TLI = Subtarget.getTargetLowering();
12779         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
12780                                     OpVT, AS, Alignment, &Fast) && Fast) {
12781           SDValue Ops[] = { SubVec2, SubVec };
12782           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
12783             return Ld;
12784         }
12785       }
12786     }
12787   }
12788 
12789   if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
12790       SubVecVT.is128BitVector())
12791     return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
12792 
12793   if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
12794     return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
12795 
12796   if (OpVT.getVectorElementType() == MVT::i1)
12797     return insert1BitVector(Op, DAG, Subtarget);
12798 
12799   return SDValue();
12800 }
12801 
12802 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
12803 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
12804 // one of the above-mentioned nodes. It has to be wrapped because otherwise
12805 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
12806 // be used to form an addressing mode. These wrapped nodes will be selected
12807 // into MOV32ri.
12808 SDValue
12809 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
12810   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12811 
12812   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12813   // global base reg.
12814   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12815   unsigned WrapperKind = X86ISD::Wrapper;
12816   CodeModel::Model M = DAG.getTarget().getCodeModel();
12817 
12818   if (Subtarget.isPICStyleRIPRel() &&
12819       (M == CodeModel::Small || M == CodeModel::Kernel))
12820     WrapperKind = X86ISD::WrapperRIP;
12821 
12822   auto PtrVT = getPointerTy(DAG.getDataLayout());
12823   SDValue Result = DAG.getTargetConstantPool(
12824       CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
12825   SDLoc DL(CP);
12826   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12827   // With PIC, the address is actually $g + Offset.
12828   if (OpFlag) {
12829     Result =
12830         DAG.getNode(ISD::ADD, DL, PtrVT,
12831                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12832   }
12833 
12834   return Result;
12835 }
12836 
12837 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
12838   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12839 
12840   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12841   // global base reg.
12842   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12843   unsigned WrapperKind = X86ISD::Wrapper;
12844   CodeModel::Model M = DAG.getTarget().getCodeModel();
12845 
12846   if (Subtarget.isPICStyleRIPRel() &&
12847       (M == CodeModel::Small || M == CodeModel::Kernel))
12848     WrapperKind = X86ISD::WrapperRIP;
12849 
12850   auto PtrVT = getPointerTy(DAG.getDataLayout());
12851   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
12852   SDLoc DL(JT);
12853   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12854 
12855   // With PIC, the address is actually $g + Offset.
12856   if (OpFlag)
12857     Result =
12858         DAG.getNode(ISD::ADD, DL, PtrVT,
12859                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12860 
12861   return Result;
12862 }
12863 
12864 SDValue
12865 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
12866   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
12867 
12868   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12869   // global base reg.
12870   const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
12871   unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
12872   unsigned WrapperKind = X86ISD::Wrapper;
12873   CodeModel::Model M = DAG.getTarget().getCodeModel();
12874 
12875   if (Subtarget.isPICStyleRIPRel() &&
12876       (M == CodeModel::Small || M == CodeModel::Kernel))
12877     WrapperKind = X86ISD::WrapperRIP;
12878 
12879   auto PtrVT = getPointerTy(DAG.getDataLayout());
12880   SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
12881 
12882   SDLoc DL(Op);
12883   Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12884 
12885   // With PIC, the address is actually $g + Offset.
12886   if (isPositionIndependent() && !Subtarget.is64Bit()) {
12887     Result =
12888         DAG.getNode(ISD::ADD, DL, PtrVT,
12889                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12890   }
12891 
12892   // For symbols that require a load from a stub to get the address, emit the
12893   // load.
12894   if (isGlobalStubReference(OpFlag))
12895     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
12896                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
12897                          false, false, false, 0);
12898 
12899   return Result;
12900 }
12901 
12902 SDValue
12903 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
12904   // Create the TargetBlockAddress node.
12905   unsigned char OpFlags =
12906     Subtarget.classifyBlockAddressReference();
12907   CodeModel::Model M = DAG.getTarget().getCodeModel();
12908   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
12909   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
12910   SDLoc dl(Op);
12911   auto PtrVT = getPointerTy(DAG.getDataLayout());
12912   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
12913 
12914   if (Subtarget.isPICStyleRIPRel() &&
12915       (M == CodeModel::Small || M == CodeModel::Kernel))
12916     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12917   else
12918     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12919 
12920   // With PIC, the address is actually $g + Offset.
12921   if (isGlobalRelativeToPICBase(OpFlags)) {
12922     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12923                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12924   }
12925 
12926   return Result;
12927 }
12928 
12929 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
12930                                               const SDLoc &dl, int64_t Offset,
12931                                               SelectionDAG &DAG) const {
12932   // Create the TargetGlobalAddress node, folding in the constant
12933   // offset if it is legal.
12934   unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
12935   CodeModel::Model M = DAG.getTarget().getCodeModel();
12936   auto PtrVT = getPointerTy(DAG.getDataLayout());
12937   SDValue Result;
12938   if (OpFlags == X86II::MO_NO_FLAG &&
12939       X86::isOffsetSuitableForCodeModel(Offset, M)) {
12940     // A direct static reference to a global.
12941     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
12942     Offset = 0;
12943   } else {
12944     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
12945   }
12946 
12947   if (Subtarget.isPICStyleRIPRel() &&
12948       (M == CodeModel::Small || M == CodeModel::Kernel))
12949     Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12950   else
12951     Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12952 
12953   // With PIC, the address is actually $g + Offset.
12954   if (isGlobalRelativeToPICBase(OpFlags)) {
12955     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12956                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12957   }
12958 
12959   // For globals that require a load from a stub to get the address, emit the
12960   // load.
12961   if (isGlobalStubReference(OpFlags))
12962     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
12963                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
12964                          false, false, false, 0);
12965 
12966   // If there was a non-zero offset that we didn't fold, create an explicit
12967   // addition for it.
12968   if (Offset != 0)
12969     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
12970                          DAG.getConstant(Offset, dl, PtrVT));
12971 
12972   return Result;
12973 }
12974 
12975 SDValue
12976 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
12977   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
12978   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
12979   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
12980 }
12981 
12982 static SDValue
12983 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
12984            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
12985            unsigned char OperandFlags, bool LocalDynamic = false) {
12986   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
12987   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
12988   SDLoc dl(GA);
12989   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
12990                                            GA->getValueType(0),
12991                                            GA->getOffset(),
12992                                            OperandFlags);
12993 
12994   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
12995                                            : X86ISD::TLSADDR;
12996 
12997   if (InFlag) {
12998     SDValue Ops[] = { Chain,  TGA, *InFlag };
12999     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13000   } else {
13001     SDValue Ops[]  = { Chain, TGA };
13002     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13003   }
13004 
13005   // TLSADDR will be codegen'ed as a call. Inform MFI that function has calls.
13006   MFI->setAdjustsStack(true);
13007   MFI->setHasCalls(true);
13008 
13009   SDValue Flag = Chain.getValue(1);
13010   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13011 }
13012 
13013 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
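      // On ELF targets this typically materializes as:
      //   leal x@tlsgd(,%ebx,1), %eax
      //   call ___tls_get_addr@PLT
      // with %ebx holding the GOT base.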
13014 static SDValue
13015 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13016                                 const EVT PtrVT) {
13017   SDValue InFlag;
13018   SDLoc dl(GA);  // ? function entry point might be better
13019   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13020                                    DAG.getNode(X86ISD::GlobalBaseReg,
13021                                                SDLoc(), PtrVT), InFlag);
13022   InFlag = Chain.getValue(1);
13023 
13024   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13025 }
13026 
13027 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13028 static SDValue
13029 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13030                                 const EVT PtrVT) {
13031   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13032                     X86::RAX, X86II::MO_TLSGD);
13033 }
13034 
13035 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13036                                            SelectionDAG &DAG,
13037                                            const EVT PtrVT,
13038                                            bool is64Bit) {
13039   SDLoc dl(GA);
13040 
13041   // Get the start address of the TLS block for this module.
13042   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13043       .getInfo<X86MachineFunctionInfo>();
13044   MFI->incNumLocalDynamicTLSAccesses();
13045 
13046   SDValue Base;
13047   if (is64Bit) {
13048     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13049                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13050   } else {
13051     SDValue InFlag;
13052     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13053         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13054     InFlag = Chain.getValue(1);
13055     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13056                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13057   }
13058 
13059   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13060   // of Base.
13061 
13062   // Build x@dtpoff.
13063   unsigned char OperandFlags = X86II::MO_DTPOFF;
13064   unsigned WrapperKind = X86ISD::Wrapper;
13065   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13066                                            GA->getValueType(0),
13067                                            GA->getOffset(), OperandFlags);
13068   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13069 
13070   // Add x@dtpoff with the base.
13071   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13072 }
13073 
13074 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13075 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13076                                    const EVT PtrVT, TLSModel::Model model,
13077                                    bool is64Bit, bool isPIC) {
13078   SDLoc dl(GA);
13079 
13080   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
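  // Note: in the X86 backend, pointers in address space 256 are %gs-relative
  // and pointers in address space 257 are %fs-relative, so a null pointer in
  // the appropriate address space describes a load from %gs:0 or %fs:0.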
13081   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13082                                                          is64Bit ? 257 : 256));
13083 
13084   SDValue ThreadPointer =
13085       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
13086                   MachinePointerInfo(Ptr), false, false, false, 0);
13087 
13088   unsigned char OperandFlags = 0;
13089   // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
13090   // the initial-exec model.
13091   unsigned WrapperKind = X86ISD::Wrapper;
13092   if (model == TLSModel::LocalExec) {
13093     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13094   } else if (model == TLSModel::InitialExec) {
13095     if (is64Bit) {
13096       OperandFlags = X86II::MO_GOTTPOFF;
13097       WrapperKind = X86ISD::WrapperRIP;
13098     } else {
13099       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13100     }
13101   } else {
13102     llvm_unreachable("Unexpected model");
13103   }
13104 
13105   // emit "addl x@ntpoff,%eax" (local exec)
13106   // or "addl x@indntpoff,%eax" (initial exec)
13107   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13108   SDValue TGA =
13109       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13110                                  GA->getOffset(), OperandFlags);
13111   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13112 
13113   if (model == TLSModel::InitialExec) {
13114     if (isPIC && !is64Bit) {
13115       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13116                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13117                            Offset);
13118     }
13119 
13120     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13121                          MachinePointerInfo::getGOT(DAG.getMachineFunction()),
13122                          false, false, false, 0);
13123   }
13124 
13125   // The address of the thread local variable is the add of the thread
13126   // pointer with the offset of the variable.
13127   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13128 }
13129 
13130 SDValue
13131 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13132 
13133   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13134 
13135   if (DAG.getTarget().Options.EmulatedTLS)
13136     return LowerToTLSEmulatedModel(GA, DAG);
13137 
13138   const GlobalValue *GV = GA->getGlobal();
13139   auto PtrVT = getPointerTy(DAG.getDataLayout());
13140   bool PositionIndependent = isPositionIndependent();
13141 
13142   if (Subtarget.isTargetELF()) {
13143     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13144     switch (model) {
13145       case TLSModel::GeneralDynamic:
13146         if (Subtarget.is64Bit())
13147           return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
13148         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
13149       case TLSModel::LocalDynamic:
13150         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
13151                                            Subtarget.is64Bit());
13152       case TLSModel::InitialExec:
13153       case TLSModel::LocalExec:
13154         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
13155                                    PositionIndependent);
13156     }
13157     llvm_unreachable("Unknown TLS model.");
13158   }
13159 
13160   if (Subtarget.isTargetDarwin()) {
13161     // Darwin only has one model of TLS.  Lower to that.
13162     unsigned char OpFlag = 0;
13163     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
13164                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13165 
13166     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13167     // global base reg.
13168     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
13169     if (PIC32)
13170       OpFlag = X86II::MO_TLVP_PIC_BASE;
13171     else
13172       OpFlag = X86II::MO_TLVP;
13173     SDLoc DL(Op);
13174     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13175                                                 GA->getValueType(0),
13176                                                 GA->getOffset(), OpFlag);
13177     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
13178 
13179     // With PIC32, the address is actually $g + Offset.
13180     if (PIC32)
13181       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
13182                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13183                            Offset);
13184 
13185     // Lowering the machine ISD node will make sure everything ends up in the
13186     // right location.
13187     SDValue Chain = DAG.getEntryNode();
13188     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13189     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
13190     SDValue Args[] = { Chain, Offset };
13191     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13192     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
13193                                DAG.getIntPtrConstant(0, DL, true),
13194                                Chain.getValue(1), DL);
13195 
13196     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13197     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13198     MFI->setAdjustsStack(true);
13199 
13200     // And our return value (tls address) is in the standard call return value
13201     // location.
13202     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
13203     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
13204   }
13205 
13206   if (Subtarget.isTargetKnownWindowsMSVC() ||
13207       Subtarget.isTargetWindowsItanium() ||
13208       Subtarget.isTargetWindowsGNU()) {
13209     // Just use the implicit TLS architecture
13210     // Need to generate something similar to:
13211     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13212     //                                  ; from TEB
13213     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13214     //   mov     rcx, qword [rdx+rcx*8]
13215     //   mov     eax, .tls$:tlsvar
13216     //   [rax+rcx] contains the address
13217     // Windows 64bit: gs:0x58
13218     // Windows 32bit: fs:__tls_array
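    // Roughly, for the general (non-local-exec) case, in C-like pseudocode
    // (a sketch; `base` is illustrative, while TlsArray/ThreadPointer are the
    // locals built below):
    //   base = ((char **)ThreadPointer)[_tls_index]; // this module's TLS block
    //   addr = base + x@SECREL;                      // offset of x within .tls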
13219 
13220     SDLoc dl(GA);
13221     SDValue Chain = DAG.getEntryNode();
13222 
13223     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13224     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13225     // use its literal value of 0x2C.
13226     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
13227                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13228                                                              256)
13229                                         : Type::getInt32PtrTy(*DAG.getContext(),
13230                                                               257));
13231 
13232     SDValue TlsArray = Subtarget.is64Bit()
13233                            ? DAG.getIntPtrConstant(0x58, dl)
13234                            : (Subtarget.isTargetWindowsGNU()
13235                                   ? DAG.getIntPtrConstant(0x2C, dl)
13236                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
13237 
13238     SDValue ThreadPointer =
13239         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
13240                     false, false, 0);
13241 
13242     SDValue res;
13243     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
13244       res = ThreadPointer;
13245     } else {
13246       // Load the _tls_index variable
13247       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
13248       if (Subtarget.is64Bit())
13249         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
13250                              MachinePointerInfo(), MVT::i32, false, false,
13251                              false, 0);
13252       else
13253         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
13254                           false, false, 0);
13255 
13256       auto &DL = DAG.getDataLayout();
13257       SDValue Scale =
13258           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
13259       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
13260 
13261       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
13262     }
13263 
13264     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
13265                       false, 0);
13266 
13267     // Get the offset of start of .tls section
13268     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13269                                              GA->getValueType(0),
13270                                              GA->getOffset(), X86II::MO_SECREL);
13271     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
13272 
13273     // The address of the thread local variable is the add of the thread
13274     // pointer with the offset of the variable.
13275     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
13276   }
13277 
13278   llvm_unreachable("TLS not implemented for this target.");
13279 }
13280 
13281 /// Lower SRA_PARTS and friends, which return two i32 values
13282 /// and take a 2 x i32 value to shift plus a shift amount.
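/// As a sketch of the expansion for SHL_PARTS (the SRL/SRA forms are
/// symmetric): for a shift amount Amt < 32 the result is
///   Hi = shld(Hi, Lo, Amt),  Lo = Lo << Amt
/// while for Amt >= 32 a CMOV keyed on bit 5 of Amt selects
///   Hi = Lo << (Amt & 31),   Lo = 0
/// instead, since shld/shrd alone cannot be relied on for counts of 32 or
/// more.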
13283 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13284   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13285   MVT VT = Op.getSimpleValueType();
13286   unsigned VTBits = VT.getSizeInBits();
13287   SDLoc dl(Op);
13288   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13289   SDValue ShOpLo = Op.getOperand(0);
13290   SDValue ShOpHi = Op.getOperand(1);
13291   SDValue ShAmt  = Op.getOperand(2);
13292   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13293   // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
13294   // during isel.
13295   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13296                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
13297   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13298                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
13299                        : DAG.getConstant(0, dl, VT);
13300 
13301   SDValue Tmp2, Tmp3;
13302   if (Op.getOpcode() == ISD::SHL_PARTS) {
13303     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13304     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13305   } else {
13306     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13307     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13308   }
13309 
13310   // If the shift amount is larger than or equal to the width of a part, we
13311   // can't rely on the results of shld/shrd. Insert a test and select the
13312   // appropriate values for large shift amounts.
13313   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13314                                 DAG.getConstant(VTBits, dl, MVT::i8));
13315   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13316                              AndNode, DAG.getConstant(0, dl, MVT::i8));
13317 
13318   SDValue Hi, Lo;
13319   SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
13320   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13321   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13322 
13323   if (Op.getOpcode() == ISD::SHL_PARTS) {
13324     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13325     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13326   } else {
13327     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13328     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13329   }
13330 
13331   SDValue Ops[2] = { Lo, Hi };
13332   return DAG.getMergeValues(Ops, dl);
13333 }
13334 
13335 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13336                                            SelectionDAG &DAG) const {
13337   SDValue Src = Op.getOperand(0);
13338   MVT SrcVT = Src.getSimpleValueType();
13339   MVT VT = Op.getSimpleValueType();
13340   SDLoc dl(Op);
13341 
13342   if (SrcVT.isVector()) {
13343     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
13344       return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
13345                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
13346                          DAG.getUNDEF(SrcVT)));
13347     }
13348     if (SrcVT.getVectorElementType() == MVT::i1) {
13349       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13350       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13351                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
13352     }
13353     return SDValue();
13354   }
13355 
13356   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13357          "Unknown SINT_TO_FP to lower!");
13358 
13359   // These are really Legal; return the operand so the caller accepts it as
13360   // Legal.
13361   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13362     return Op;
13363   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13364       Subtarget.is64Bit()) {
13365     return Op;
13366   }
13367 
13368   SDValue ValueToStore = Op.getOperand(0);
13369   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13370       !Subtarget.is64Bit())
13371     // Bitcasting to f64 here allows us to do a single 64-bit store from
13372     // an SSE register, avoiding the store forwarding penalty that would come
13373     // with two 32-bit stores.
13374     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13375 
13376   unsigned Size = SrcVT.getSizeInBits()/8;
13377   MachineFunction &MF = DAG.getMachineFunction();
13378   auto PtrVT = getPointerTy(MF.getDataLayout());
13379   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13380   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13381   SDValue Chain = DAG.getStore(
13382       DAG.getEntryNode(), dl, ValueToStore, StackSlot,
13383       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
13384       false, 0);
13385   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13386 }
13387 
13388 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13389                                      SDValue StackSlot,
13390                                      SelectionDAG &DAG) const {
13391   // Build the FILD
13392   SDLoc DL(Op);
13393   SDVTList Tys;
13394   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13395   if (useSSE)
13396     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13397   else
13398     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13399 
13400   unsigned ByteSize = SrcVT.getSizeInBits()/8;
13401 
13402   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13403   MachineMemOperand *MMO;
13404   if (FI) {
13405     int SSFI = FI->getIndex();
13406     MMO = DAG.getMachineFunction().getMachineMemOperand(
13407         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13408         MachineMemOperand::MOLoad, ByteSize, ByteSize);
13409   } else {
13410     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13411     StackSlot = StackSlot.getOperand(1);
13412   }
13413   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13414   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13415                                            X86ISD::FILD, DL,
13416                                            Tys, Ops, SrcVT, MMO);
13417 
13418   if (useSSE) {
13419     Chain = Result.getValue(1);
13420     SDValue InFlag = Result.getValue(2);
13421 
13422     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13423     // shouldn't be necessary except that RFP cannot be live across
13424     // multiple blocks. When stackifier is fixed, they can be uncoupled.
13425     MachineFunction &MF = DAG.getMachineFunction();
13426     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13427     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13428     auto PtrVT = getPointerTy(MF.getDataLayout());
13429     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13430     Tys = DAG.getVTList(MVT::Other);
13431     SDValue Ops[] = {
13432       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13433     };
13434     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13435         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13436         MachineMemOperand::MOStore, SSFISize, SSFISize);
13437 
13438     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13439                                     Ops, Op.getValueType(), MMO);
13440     Result = DAG.getLoad(
13441         Op.getValueType(), DL, Chain, StackSlot,
13442         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13443         false, false, false, 0);
13444   }
13445 
13446   return Result;
13447 }
13448 
13449 /// 64-bit unsigned integer to double expansion.
13450 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13451                                                SelectionDAG &DAG) const {
13452   // This algorithm is not obvious. Here is what we're trying to output:
13453   /*
13454      movq       %rax,  %xmm0
13455      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13456      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13457      #ifdef __SSE3__
13458        haddpd   %xmm0, %xmm0
13459      #else
13460        pshufd   $0x4e, %xmm0, %xmm1
13461        addpd    %xmm1, %xmm0
13462      #endif
13463   */
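  // Why this works (a sketch): after the punpckldq the two double lanes hold
  // the bit patterns 0x43300000_XXXXXXXX and 0x45300000_YYYYYYYY, where X and
  // Y are the low and high 32 bits of the input.  Read as doubles these are
  // exactly 2^52 + lo32 and 2^84 + hi32 * 2^32, so subtracting {2^52, 2^84}
  // leaves {lo32, hi32 * 2^32}, and the final horizontal add reassembles the
  // unsigned 64-bit value (rounded once to double).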
13464 
13465   SDLoc dl(Op);
13466   LLVMContext *Context = DAG.getContext();
13467 
13468   // Build some magic constants.
13469   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13470   Constant *C0 = ConstantDataVector::get(*Context, CV0);
13471   auto PtrVT = getPointerTy(DAG.getDataLayout());
13472   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
13473 
13474   SmallVector<Constant*,2> CV1;
13475   CV1.push_back(
13476     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13477                                       APInt(64, 0x4330000000000000ULL))));
13478   CV1.push_back(
13479     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13480                                       APInt(64, 0x4530000000000000ULL))));
13481   Constant *C1 = ConstantVector::get(CV1);
13482   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
13483 
13484   // Load the 64-bit value into an XMM register.
13485   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13486                             Op.getOperand(0));
13487   SDValue CLod0 =
13488       DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13489                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13490                   false, false, false, 16);
13491   SDValue Unpck1 =
13492       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
13493 
13494   SDValue CLod1 =
13495       DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13496                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13497                   false, false, false, 16);
13498   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
13499   // TODO: Are there any fast-math-flags to propagate here?
13500   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13501   SDValue Result;
13502 
13503   if (Subtarget.hasSSE3()) {
13504     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13505     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13506   } else {
13507     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
13508     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13509                                            S2F, 0x4E, DAG);
13510     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13511                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
13512   }
13513 
13514   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13515                      DAG.getIntPtrConstant(0, dl));
13516 }
13517 
13518 /// 32-bit unsigned integer to float expansion.
13519 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13520                                                SelectionDAG &DAG) const {
13521   SDLoc dl(Op);
13522   // FP constant to bias correct the final result.
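  // 0x4330000000000000 is the bit pattern of 2^52; OR'ing a 32-bit value into
  // the low mantissa bits below gives exactly 2^52 + x, so subtracting the
  // bias again recovers x as a double with no rounding.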
13523   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
13524                                    MVT::f64);
13525 
13526   // Load the 32-bit value into an XMM register.
13527   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13528                              Op.getOperand(0));
13529 
13530   // Zero out the upper parts of the register.
13531   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13532 
13533   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13534                      DAG.getBitcast(MVT::v2f64, Load),
13535                      DAG.getIntPtrConstant(0, dl));
13536 
13537   // Or the load with the bias.
13538   SDValue Or = DAG.getNode(
13539       ISD::OR, dl, MVT::v2i64,
13540       DAG.getBitcast(MVT::v2i64,
13541                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
13542       DAG.getBitcast(MVT::v2i64,
13543                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
13544   Or =
13545       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13546                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
13547 
13548   // Subtract the bias.
13549   // TODO: Are there any fast-math-flags to propagate here?
13550   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13551 
13552   // Handle final rounding.
13553   MVT DestVT = Op.getSimpleValueType();
13554 
13555   if (DestVT.bitsLT(MVT::f64))
13556     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13557                        DAG.getIntPtrConstant(0, dl));
13558   if (DestVT.bitsGT(MVT::f64))
13559     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13560 
13561   // Handle final rounding.
13562   return Sub;
13563 }
13564 
13565 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13566                                      const X86Subtarget &Subtarget) {
13567   // The algorithm is the following:
13568   // #ifdef __SSE4_1__
13569   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13570   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13571   //                                 (uint4) 0x53000000, 0xaa);
13572   // #else
13573   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13574   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13575   // #endif
13576   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13577   //     return (float4) lo + fhi;
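  //
  // A sketch of why the constants work: 0x4b000000 is the bit pattern of 2^23,
  // so lo becomes exactly 2^23 + (v & 0xffff); 0x53000000 is 2^39, so hi
  // becomes 2^39 + (v >> 16) * 2^16.  Subtracting (2^39 + 2^23) from hi and
  // then adding lo reconstructs v, with a single rounding in the final add.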
13578 
13579   // We shouldn't use it when unsafe-fp-math is enabled though: we might later
13580   // reassociate the two FADDs, and if we do that, the algorithm fails
13581   // spectacularly (PR24512).
13582   // FIXME: If we ever have some kind of Machine FMF, this should be marked
13583   // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
13584   // there's also the MachineCombiner reassociations happening on Machine IR.
13585   if (DAG.getTarget().Options.UnsafeFPMath)
13586     return SDValue();
13587 
13588   SDLoc DL(Op);
13589   SDValue V = Op->getOperand(0);
13590   MVT VecIntVT = V.getSimpleValueType();
13591   bool Is128 = VecIntVT == MVT::v4i32;
13592   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13593   // If we convert to something else than the supported type, e.g., to v4f64,
13594   // abort early.
13595   if (VecFloatVT != Op->getSimpleValueType(0))
13596     return SDValue();
13597 
13598   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13599          "Unsupported custom type");
13600 
13601   // In the #ifdef/#else code, we have in common:
13602   // - The vector of constants:
13603   // -- 0x4b000000
13604   // -- 0x53000000
13605   // - A shift:
13606   // -- v >> 16
13607 
13608   // Create the splat vector for 0x4b000000.
13609   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
13610   // Create the splat vector for 0x53000000.
13611   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
13612 
13613   // Create the right shift.
13614   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
13615   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
13616 
13617   SDValue Low, High;
13618   if (Subtarget.hasSSE41()) {
13619     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
13620     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13621     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
13622     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
13623     // Low will be bitcasted right away, so do not bother bitcasting back to its
13624     // original type.
13625     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
13626                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13627     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13628     //                                 (uint4) 0x53000000, 0xaa);
13629     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
13630     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
13631     // High will be bitcasted right away, so do not bother bitcasting back to
13632     // its original type.
13633     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
13634                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13635   } else {
13636     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
13637     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13638     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
13639     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
13640 
13641     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13642     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
13643   }
13644 
13645   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
13646   SDValue VecCstFAdd = DAG.getConstantFP(
13647       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
13648 
13649   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13650   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
13651   // TODO: Are there any fast-math-flags to propagate here?
13652   SDValue FHigh =
13653       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
13654   //     return (float4) lo + fhi;
13655   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
13656   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
13657 }
13658 
13659 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
13660                                                SelectionDAG &DAG) const {
13661   SDValue N0 = Op.getOperand(0);
13662   MVT SVT = N0.getSimpleValueType();
13663   SDLoc dl(Op);
13664 
13665   switch (SVT.SimpleTy) {
13666   default:
13667     llvm_unreachable("Custom UINT_TO_FP is not supported!");
13668   case MVT::v4i8:
13669   case MVT::v4i16:
13670   case MVT::v8i8:
13671   case MVT::v8i16: {
13672     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
13673     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13674                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
13675   }
13676   case MVT::v4i32:
13677   case MVT::v8i32:
13678     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
13679   case MVT::v16i8:
13680   case MVT::v16i16:
13681     assert(Subtarget.hasAVX512());
13682     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
13683                        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
13684   }
13685 }
13686 
13687 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
13688                                            SelectionDAG &DAG) const {
13689   SDValue N0 = Op.getOperand(0);
13690   SDLoc dl(Op);
13691   auto PtrVT = getPointerTy(DAG.getDataLayout());
13692 
13693   if (Op.getSimpleValueType().isVector())
13694     return lowerUINT_TO_FP_vec(Op, DAG);
13695 
13696   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
13697   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
13698   // the optimization here.
13699   if (DAG.SignBitIsZero(N0))
13700     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
13701 
13702   MVT SrcVT = N0.getSimpleValueType();
13703   MVT DstVT = Op.getSimpleValueType();
13704 
13705   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
13706       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
13707     // Conversions from unsigned i32 to f32/f64 are legal,
13708     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
13709     return Op;
13710   }
13711 
13712   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
13713     return LowerUINT_TO_FP_i64(Op, DAG);
13714   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
13715     return LowerUINT_TO_FP_i32(Op, DAG);
13716   if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
13717     return SDValue();
13718 
13719   // Make a 64-bit buffer, and use it to build an FILD.
13720   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
13721   if (SrcVT == MVT::i32) {
13722     SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
13723     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13724                                   StackSlot, MachinePointerInfo(),
13725                                   false, false, 0);
13726     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
13727                                   OffsetSlot, MachinePointerInfo(),
13728                                   false, false, 0);
13729     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
13730     return Fild;
13731   }
13732 
13733   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
13734   SDValue ValueToStore = Op.getOperand(0);
13735   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
13736     // Bitcasting to f64 here allows us to do a single 64-bit store from
13737     // an SSE register, avoiding the store forwarding penalty that would come
13738     // with two 32-bit stores.
13739     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13740   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
13741                                StackSlot, MachinePointerInfo(),
13742                                false, false, 0);
13743   // For i64 source, we need to add the appropriate power of 2 if the input
13744   // was negative.  This is the same as the optimization in
13745   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
13746   // we must be careful to do the computation in x87 extended precision, not
13747   // in SSE. (The generic code can't know it's OK to do this, or how to.)
13748   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
13749   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13750       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13751       MachineMemOperand::MOLoad, 8, 8);
13752 
13753   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
13754   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
13755   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
13756                                          MVT::i64, MMO);
13757 
13758   APInt FF(32, 0x5F800000ULL);
13759 
13760   // Check whether the sign bit is set.
13761   SDValue SignSet = DAG.getSetCC(
13762       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
13763       Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
13764 
13765   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
13766   SDValue FudgePtr = DAG.getConstantPool(
13767       ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
13768 
13769   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
13770   SDValue Zero = DAG.getIntPtrConstant(0, dl);
13771   SDValue Four = DAG.getIntPtrConstant(4, dl);
13772   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
13773                                Zero, Four);
13774   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
13775 
13776   // Load the value out, extending it from f32 to f80.
13777   // FIXME: Avoid the extend by constructing the right constant pool?
13778   SDValue Fudge = DAG.getExtLoad(
13779       ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
13780       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
13781       false, false, false, 4);
13782   // Extend everything to 80 bits to force it to be done on x87.
13783   // TODO: Are there any fast-math-flags to propagate here?
13784   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
13785   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
13786                      DAG.getIntPtrConstant(0, dl));
13787 }
13788 
13789 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
13790 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
13791 // just return an <SDValue(), SDValue()> pair.
13792 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
13793 // to i16, i32 or i64, and we lower it to a legal sequence.
13794 // If lowered to the final integer result we return a <result, SDValue()> pair.
13795 // Otherwise we lower it to a sequence ending with a FIST, return a
13796 // <FIST, StackSlot> pair, and the caller is responsible for loading
13797 // the final integer result from StackSlot.
13798 std::pair<SDValue,SDValue>
13799 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
13800                                    bool IsSigned, bool IsReplace) const {
13801   SDLoc DL(Op);
13802 
13803   EVT DstTy = Op.getValueType();
13804   EVT TheVT = Op.getOperand(0).getValueType();
13805   auto PtrVT = getPointerTy(DAG.getDataLayout());
13806 
13807   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
13808     // f16 must be promoted before using the lowering in this routine.
13809     // fp128 does not use this lowering.
13810     return std::make_pair(SDValue(), SDValue());
13811   }
13812 
13813   // If using FIST to compute an unsigned i64, we'll need some fixup
13814   // to handle values above the maximum signed i64.  A FIST is always
13815   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
13816   bool UnsignedFixup = !IsSigned &&
13817                        DstTy == MVT::i64 &&
13818                        (!Subtarget.is64Bit() ||
13819                         !isScalarFPTypeInSSEReg(TheVT));
13820 
13821   if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
13822     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
13823     // The low 32 bits of the fist result will have the correct uint32 result.
13824     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
13825     DstTy = MVT::i64;
13826   }
13827 
13828   assert(DstTy.getSimpleVT() <= MVT::i64 &&
13829          DstTy.getSimpleVT() >= MVT::i16 &&
13830          "Unknown FP_TO_INT to lower!");
13831 
13832   // These are really Legal.
13833   if (DstTy == MVT::i32 &&
13834       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13835     return std::make_pair(SDValue(), SDValue());
13836   if (Subtarget.is64Bit() &&
13837       DstTy == MVT::i64 &&
13838       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13839     return std::make_pair(SDValue(), SDValue());
13840 
13841   // We lower FP->int64 into FISTP64 followed by a load from a temporary
13842   // stack slot.
13843   MachineFunction &MF = DAG.getMachineFunction();
13844   unsigned MemSize = DstTy.getSizeInBits()/8;
13845   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13846   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13847 
13848   unsigned Opc;
13849   switch (DstTy.getSimpleVT().SimpleTy) {
13850   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
13851   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
13852   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
13853   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
13854   }
13855 
13856   SDValue Chain = DAG.getEntryNode();
13857   SDValue Value = Op.getOperand(0);
13858   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
13859 
13860   if (UnsignedFixup) {
13861     //
13862     // Conversion to unsigned i64 is implemented with a select,
13863     // depending on whether the source value fits in the range
13864     // of a signed i64.  Let Thresh be the FP equivalent of
13865     // 0x8000000000000000ULL.
13866     //
13867     //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
13868     //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
13869     //  Fist-to-mem64 FistSrc
13870     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
13871     //  to XOR'ing the high 32 bits with Adjust.
13872     //
13873     // Being a power of 2, Thresh is exactly representable in all FP formats.
13874     // For X87 we'd like to use the smallest FP type for this constant, but
13875     // for DAG type consistency we have to match the FP operand type.
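    //
    // For example (a sketch, ignoring rounding of the FP input): for
    // Value = 2^63 + 42 the comparison against Thresh is false, so
    // Adjust = 0x80000000 and FistSrc = 42; XOR'ing the high half of the
    // 64-bit FIST result with Adjust then produces 0x8000000000000000 + 42,
    // the desired uint64.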
13876 
13877     APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
13878     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
13879     bool LosesInfo = false;
13880     if (TheVT == MVT::f64)
13881       // The rounding mode is irrelevant as the conversion should be exact.
13882       Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
13883                               &LosesInfo);
13884     else if (TheVT == MVT::f80)
13885       Status = Thresh.convert(APFloat::x87DoubleExtended,
13886                               APFloat::rmNearestTiesToEven, &LosesInfo);
13887 
13888     assert(Status == APFloat::opOK && !LosesInfo &&
13889            "FP conversion should have been exact");
13890 
13891     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
13892 
13893     SDValue Cmp = DAG.getSetCC(DL,
13894                                getSetCCResultType(DAG.getDataLayout(),
13895                                                   *DAG.getContext(), TheVT),
13896                                Value, ThreshVal, ISD::SETLT);
13897     Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
13898                            DAG.getConstant(0, DL, MVT::i32),
13899                            DAG.getConstant(0x80000000, DL, MVT::i32));
13900     SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
13901     Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
13902                                               *DAG.getContext(), TheVT),
13903                        Value, ThreshVal, ISD::SETLT);
13904     Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
13905   }
13906 
13907   // FIXME: This causes a redundant load/store if the SSE-class value is
13908   // already in memory, such as if it is on the call stack.
13909   if (isScalarFPTypeInSSEReg(TheVT)) {
13910     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
13911     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
13912                          MachinePointerInfo::getFixedStack(MF, SSFI), false,
13913                          false, 0);
13914     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
13915     SDValue Ops[] = {
13916       Chain, StackSlot, DAG.getValueType(TheVT)
13917     };
13918 
13919     MachineMemOperand *MMO =
13920         MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13921                                 MachineMemOperand::MOLoad, MemSize, MemSize);
13922     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
13923     Chain = Value.getValue(1);
13924     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13925     StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13926   }
13927 
13928   MachineMemOperand *MMO =
13929       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13930                               MachineMemOperand::MOStore, MemSize, MemSize);
13931 
13932   if (UnsignedFixup) {
13933 
13934     // Insert the FIST, load its result as two i32's,
13935     // and XOR the high i32 with Adjust.
13936 
13937     SDValue FistOps[] = { Chain, Value, StackSlot };
13938     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13939                                            FistOps, DstTy, MMO);
13940 
13941     SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
13942                                 MachinePointerInfo(),
13943                                 false, false, false, 0);
13944     SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
13945 
13946     SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
13947                                  MachinePointerInfo(),
13948                                  false, false, false, 0);
13949     High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
13950 
13951     if (Subtarget.is64Bit()) {
13952       // Join High32 and Low32 into a 64-bit result.
13953       // (High32 << 32) | Low32
13954       Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
13955       High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
13956       High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
13957                            DAG.getConstant(32, DL, MVT::i8));
13958       SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
13959       return std::make_pair(Result, SDValue());
13960     }
13961 
13962     SDValue ResultOps[] = { Low32, High32 };
13963 
13964     SDValue pair = IsReplace
13965       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
13966       : DAG.getMergeValues(ResultOps, DL);
13967     return std::make_pair(pair, SDValue());
13968   } else {
13969     // Build the FP_TO_INT*_IN_MEM
13970     SDValue Ops[] = { Chain, Value, StackSlot };
13971     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13972                                            Ops, DstTy, MMO);
13973     return std::make_pair(FIST, StackSlot);
13974   }
13975 }
13976 
13977 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
13978                               const X86Subtarget &Subtarget) {
13979   MVT VT = Op->getSimpleValueType(0);
13980   SDValue In = Op->getOperand(0);
13981   MVT InVT = In.getSimpleValueType();
13982   SDLoc dl(Op);
13983 
13984   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
13985     return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
13986 
13987   // Optimize vectors in AVX mode:
13988   //
13989   //   v8i16 -> v8i32
13990   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
13991   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
13992   //   Concat upper and lower parts.
13993   //
13994   //   v4i32 -> v4i64
13995   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
13996   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
13997   //   Concat upper and lower parts.
13998   //
13999 
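  // For example (a sketch), zero-extending v8i16 x = <a,b,c,d,e,f,g,h>:
  // vpunpcklwd with a zero vector yields <a,0,b,0,c,0,d,0>, which read as
  // v4i32 is <a,b,c,d> zero-extended; vpunpckhwd covers <e,f,g,h> the same
  // way.  For ANY_EXTEND the unpack partner is undef instead of zero.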
14000   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14001       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14002       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14003     return SDValue();
14004 
14005   if (Subtarget.hasInt256())
14006     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14007 
14008   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14009   SDValue Undef = DAG.getUNDEF(InVT);
14010   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14011   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14012   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14013 
14014   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14015                              VT.getVectorNumElements()/2);
14016 
14017   OpLo = DAG.getBitcast(HVT, OpLo);
14018   OpHi = DAG.getBitcast(HVT, OpHi);
14019 
14020   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14021 }
14022 
14023 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14024                   const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14025   MVT VT = Op->getSimpleValueType(0);
14026   SDValue In = Op->getOperand(0);
14027   MVT InVT = In.getSimpleValueType();
14028   SDLoc DL(Op);
14029   unsigned int NumElts = VT.getVectorNumElements();
14030   if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
14031     return SDValue();
14032 
14033   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14034     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14035 
14036   assert(InVT.getVectorElementType() == MVT::i1);
14037 
14038   // Extend VT if the target is a 256- or 128-bit vector and VLX is not supported.
14039   MVT ExtVT = VT;
14040   if (!VT.is512BitVector() && !Subtarget.hasVLX())
14041     ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14042 
14043   SDValue One =
14044    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
14045   SDValue Zero =
14046    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
14047 
14048   SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
14049   if (VT == ExtVT)
14050     return SelectedVal;
14051   return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
14052 }
14053 
14054 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14055                                SelectionDAG &DAG) {
14056   if (Subtarget.hasFp256())
14057     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14058       return Res;
14059 
14060   return SDValue();
14061 }
14062 
14063 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14064                                 SelectionDAG &DAG) {
14065   SDLoc DL(Op);
14066   MVT VT = Op.getSimpleValueType();
14067   SDValue In = Op.getOperand(0);
14068   MVT SVT = In.getSimpleValueType();
14069 
14070   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14071     return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
14072 
14073   if (Subtarget.hasFp256())
14074     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14075       return Res;
14076 
14077   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14078          VT.getVectorNumElements() != SVT.getVectorNumElements());
14079   return SDValue();
14080 }
14081 
14082 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
14083                                   const X86Subtarget &Subtarget) {
14084 
14085   SDLoc DL(Op);
14086   MVT VT = Op.getSimpleValueType();
14087   SDValue In = Op.getOperand(0);
14088   MVT InVT = In.getSimpleValueType();
14089 
14090   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
14091 
14092   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
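  // After the shift only the original LSB survives, placed in the sign-bit
  // position, so VPMOVB2M/VPMOVW2M (which read the sign bit) and VPTESTM
  // (which here tests the element against itself for nonzero) both recover
  // the intended boolean mask.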
14093   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
14094   if (InVT.getScalarSizeInBits() <= 16) {
14095     if (Subtarget.hasBWI()) {
14096       // Legal; this will be selected to VPMOVB2M or VPMOVW2M.
14097       // Shifting packed bytes is not supported natively, so bitcast to words.
14098       MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
14099       SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
14100                                        DAG.getBitcast(ExtVT, In),
14101                                        DAG.getConstant(ShiftInx, DL, ExtVT));
14102       ShiftNode = DAG.getBitcast(InVT, ShiftNode);
14103       return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
14104     }
14105     // Use TESTD/Q, extended vector to packed dword/qword.
14106     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
14107            "Unexpected vector type.");
14108     unsigned NumElts = InVT.getVectorNumElements();
14109     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14110     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14111     InVT = ExtVT;
14112     ShiftInx = InVT.getScalarSizeInBits() - 1;
14113   }
14114 
14115   SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
14116                                    DAG.getConstant(ShiftInx, DL, InVT));
14117   return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
14118 }
14119 
14120 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14121   SDLoc DL(Op);
14122   MVT VT = Op.getSimpleValueType();
14123   SDValue In = Op.getOperand(0);
14124   MVT InVT = In.getSimpleValueType();
14125 
14126   if (VT == MVT::i1) {
14127     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14128            "Invalid scalar TRUNCATE operation");
14129     if (InVT.getSizeInBits() >= 32)
14130       return SDValue();
14131     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14132     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14133   }
14134   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14135          "Invalid TRUNCATE operation");
14136 
14137   if (VT.getVectorElementType() == MVT::i1)
14138     return LowerTruncateVecI1(Op, DAG, Subtarget);
14139 
14140   // vpmovqb/w/d, vpmovdb/w, vpmovwb
14141   if (Subtarget.hasAVX512()) {
14142     // word to byte only under BWI
14143     // Word-to-byte truncation is only legal with BWI; otherwise go via v16i32.
14144       return DAG.getNode(X86ISD::VTRUNC, DL, VT,
14145                          DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
14146     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14147   }
14148   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14149     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14150     if (Subtarget.hasInt256()) {
14151       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14152       In = DAG.getBitcast(MVT::v8i32, In);
14153       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14154                                 ShufMask);
14155       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14156                          DAG.getIntPtrConstant(0, DL));
14157     }
14158 
14159     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14160                                DAG.getIntPtrConstant(0, DL));
14161     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14162                                DAG.getIntPtrConstant(2, DL));
14163     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14164     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14165     static const int ShufMask[] = {0, 2, 4, 6};
14166     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14167   }
14168 
14169   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14170     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
14171     if (Subtarget.hasInt256()) {
14172       In = DAG.getBitcast(MVT::v32i8, In);
14173 
14174       SmallVector<SDValue,32> pshufbMask;
14175       for (unsigned i = 0; i < 2; ++i) {
14176         pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
14177         pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
14178         pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
14179         pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
14180         pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
14181         pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
14182         pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
14183         pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
14184         for (unsigned j = 0; j < 8; ++j)
14185           pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
14186       }
14187       SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
14188       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14189       In = DAG.getBitcast(MVT::v4i64, In);
14190 
14191       static const int ShufMask[] = {0,  2,  -1,  -1};
14192       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14193                                 ShufMask);
14194       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14195                        DAG.getIntPtrConstant(0, DL));
14196       return DAG.getBitcast(VT, In);
14197     }
14198 
14199     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14200                                DAG.getIntPtrConstant(0, DL));
14201 
14202     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14203                                DAG.getIntPtrConstant(4, DL));
14204 
14205     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
14206     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
14207 
14208     // The PSHUFB mask:
14209     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14210                                    -1, -1, -1, -1, -1, -1, -1, -1};
14211 
14212     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14213     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14214     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14215 
14216     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14217     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14218 
14219     // The MOVLHPS Mask:
14220     static const int ShufMask2[] = {0, 1, 4, 5};
14221     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14222     return DAG.getBitcast(MVT::v8i16, res);
14223   }
14224 
14225   // Handle truncation of V256 to V128 using shuffles.
14226   if (!VT.is128BitVector() || !InVT.is256BitVector())
14227     return SDValue();
14228 
14229   assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
14230 
14231   unsigned NumElems = VT.getVectorNumElements();
14232   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14233 
14234   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14235   // Prepare truncation shuffle mask
14236   for (unsigned i = 0; i != NumElems; ++i)
14237     MaskVec[i] = i * 2;
14238   SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
14239                                    DAG.getUNDEF(NVT), MaskVec);
14240   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14241                      DAG.getIntPtrConstant(0, DL));
14242 }
14243 
14244 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14245                                            SelectionDAG &DAG) const {
14246   assert(!Op.getSimpleValueType().isVector());
14247 
14248   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14249     /*IsSigned=*/ true, /*IsReplace=*/ false);
14250   SDValue FIST = Vals.first, StackSlot = Vals.second;
14251   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14252   if (!FIST.getNode())
14253     return Op;
14254 
14255   if (StackSlot.getNode())
14256     // Load the result.
14257     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14258                        FIST, StackSlot, MachinePointerInfo(),
14259                        false, false, false, 0);
14260 
14261   // The node is the result.
14262   return FIST;
14263 }
14264 
14265 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14266                                            SelectionDAG &DAG) const {
14267   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14268     /*IsSigned=*/ false, /*IsReplace=*/ false);
14269   SDValue FIST = Vals.first, StackSlot = Vals.second;
14270   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14271   if (!FIST.getNode())
14272     return Op;
14273 
14274   if (StackSlot.getNode())
14275     // Load the result.
14276     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14277                        FIST, StackSlot, MachinePointerInfo(),
14278                        false, false, false, 0);
14279 
14280   // The node is the result.
14281   return FIST;
14282 }
14283 
14284 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14285   SDLoc DL(Op);
14286   MVT VT = Op.getSimpleValueType();
14287   SDValue In = Op.getOperand(0);
14288   MVT SVT = In.getSimpleValueType();
14289 
14290   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14291 
14292   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14293                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14294                                  In, DAG.getUNDEF(SVT)));
14295 }
14296 
14297 /// The only differences between FABS and FNEG are the mask and the logic op.
14298 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14299 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14300   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14301          "Wrong opcode for lowering FABS or FNEG.");
14302 
14303   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14304 
14305   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14306   // into an FNABS. We'll lower the FABS after that if it is still in use.
14307   if (IsFABS)
14308     for (SDNode *User : Op->uses())
14309       if (User->getOpcode() == ISD::FNEG)
14310         return Op;
14311 
14312   SDLoc dl(Op);
14313   MVT VT = Op.getSimpleValueType();
14314 
14315   bool IsF128 = (VT == MVT::f128);
14316 
14317   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14318   // decide if we should generate a 16-byte constant mask when we only need 4 or
14319   // 8 bytes for the scalar case.
14320 
14321   MVT LogicVT;
14322   MVT EltVT;
14323   unsigned NumElts;
14324 
14325   if (VT.isVector()) {
14326     LogicVT = VT;
14327     EltVT = VT.getVectorElementType();
14328     NumElts = VT.getVectorNumElements();
14329   } else if (IsF128) {
14330     // SSE instructions are used for optimized f128 logical operations.
14331     LogicVT = MVT::f128;
14332     EltVT = VT;
14333     NumElts = 1;
14334   } else {
14335     // There are no scalar bitwise logical SSE/AVX instructions, so we
14336     // generate a 16-byte vector constant and logic op even for the scalar case.
14337     // Using a 16-byte mask allows folding the load of the mask with
14338     // the logic op, so it can save (~4 bytes) on code size.
14339     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
14340     EltVT = VT;
14341     NumElts = (VT == MVT::f64) ? 2 : 4;
14342   }
14343 
14344   unsigned EltBits = EltVT.getSizeInBits();
14345   LLVMContext *Context = DAG.getContext();
14346   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
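  // E.g. for f32 elements the FABS mask is 0x7fffffff (clear the sign bit:
  // fabs(x) == x & 0x7fffffff) and the FNEG mask is 0x80000000 (flip the sign
  // bit: -x == x ^ 0x80000000); f64 uses the analogous 64-bit constants.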
14347   APInt MaskElt =
14348     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14349   Constant *C = ConstantInt::get(*Context, MaskElt);
14350   C = ConstantVector::getSplat(NumElts, C);
14351   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14352   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
14353   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14354   SDValue Mask =
14355       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14356                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14357                   false, false, false, Alignment);
14358 
14359   SDValue Op0 = Op.getOperand(0);
14360   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14361   unsigned LogicOp =
14362     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14363   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14364 
14365   if (VT.isVector() || IsF128)
14366     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14367 
14368   // For the scalar case extend to a 128-bit vector, perform the logic op,
14369   // and extract the scalar result back out.
14370   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
14371   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14372   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
14373                      DAG.getIntPtrConstant(0, dl));
14374 }
14375 
14376 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14377   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14378   LLVMContext *Context = DAG.getContext();
14379   SDValue Op0 = Op.getOperand(0);
14380   SDValue Op1 = Op.getOperand(1);
14381   SDLoc dl(Op);
14382   MVT VT = Op.getSimpleValueType();
14383   MVT SrcVT = Op1.getSimpleValueType();
14384   bool IsF128 = (VT == MVT::f128);
14385 
14386   // If second operand is smaller, extend it first.
14387   if (SrcVT.bitsLT(VT)) {
14388     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14389     SrcVT = VT;
14390   }
14391   // And if it is bigger, shrink it first.
14392   if (SrcVT.bitsGT(VT)) {
14393     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
14394     SrcVT = VT;
14395   }
14396 
14397   // At this point the operands and the result should have the same
14398   // type, and that won't be f80 since that is not custom lowered.
14399   assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
14400          "Unexpected type in LowerFCOPYSIGN");
14401 
14402   const fltSemantics &Sem =
14403       VT == MVT::f64 ? APFloat::IEEEdouble :
14404           (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
14405   const unsigned SizeInBits = VT.getSizeInBits();
14406 
14407   SmallVector<Constant *, 4> CV(
14408       VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
14409       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14410 
14411   // First, clear all bits but the sign bit from the second operand (sign).
14412   CV[0] = ConstantFP::get(*Context,
14413                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14414   Constant *C = ConstantVector::get(CV);
14415   auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
14416   SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14417 
14418   // Perform all logic operations as 16-byte vectors because there are no
14419   // scalar FP logic instructions in SSE. This allows load folding of the
14420   // constants into the logic instructions.
14421   MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
14422   SDValue Mask1 =
14423       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14424                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14425                   false, false, false, 16);
14426   if (!IsF128)
14427     Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
14428   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
14429 
14430   // Next, clear the sign bit from the first operand (magnitude).
14431   // If it's a constant, we can clear it here.
14432   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14433     APFloat APF = Op0CN->getValueAPF();
14434     // If the magnitude is a positive zero, the sign bit alone is enough.
14435     if (APF.isPosZero())
14436       return IsF128 ? SignBit :
14437           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
14438                       DAG.getIntPtrConstant(0, dl));
14439     APF.clearSign();
14440     CV[0] = ConstantFP::get(*Context, APF);
14441   } else {
14442     CV[0] = ConstantFP::get(
14443         *Context,
14444         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14445   }
14446   C = ConstantVector::get(CV);
14447   CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14448   SDValue Val =
14449       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14450                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14451                   false, false, false, 16);
14452   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14453   if (!isa<ConstantFPSDNode>(Op0)) {
14454     if (!IsF128)
14455       Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
14456     Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
14457   }
14458   // OR the magnitude value with the sign bit.
14459   Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
14460   return IsF128 ? Val :
14461       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
14462                   DAG.getIntPtrConstant(0, dl));
14463 }
14464 
14465 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14466   SDValue N0 = Op.getOperand(0);
14467   SDLoc dl(Op);
14468   MVT VT = Op.getSimpleValueType();
14469 
14470   MVT OpVT = N0.getSimpleValueType();
14471   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
14472          "Unexpected type for FGETSIGN");
14473 
14474   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
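  // MOVMSK packs the sign bits of the vector elements into the low bits of an
  // i32, so masking with 1 isolates the sign of element 0. E.g. N0 = -3.5
  // yields 1 and N0 = +2.0 yields 0.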
14475   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
14476   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
14477   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
14478   Res = DAG.getZExtOrTrunc(Res, dl, VT);
14479   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
14480   return Res;
14481 }
14482 
14483 // Check whether an OR'd tree is PTEST-able.
14484 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
14485                                       SelectionDAG &DAG) {
14486   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14487 
14488   if (!Subtarget.hasSSE41())
14489     return SDValue();
14490 
14491   if (!Op->hasOneUse())
14492     return SDValue();
14493 
14494   SDNode *N = Op.getNode();
14495   SDLoc DL(N);
14496 
14497   SmallVector<SDValue, 8> Opnds;
14498   DenseMap<SDValue, unsigned> VecInMap;
14499   SmallVector<SDValue, 8> VecIns;
14500   EVT VT = MVT::Other;
14501 
14502   // Recognize a special case where a vector is cast into a wide integer to
14503   // test all 0s.
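  // E.g. a check like (bitcast v2i64 X to i128) == 0 can show up after
  // legalization as an OR of EXTRACT_VECTOR_ELTs of X compared against zero;
  // if every element of X is covered exactly once, the whole OR tree can be
  // replaced by a single PTEST X, X.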
14504   Opnds.push_back(N->getOperand(0));
14505   Opnds.push_back(N->getOperand(1));
14506 
14507   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14508     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14509     // BFS traverse all OR'd operands.
14510     if (I->getOpcode() == ISD::OR) {
14511       Opnds.push_back(I->getOperand(0));
14512       Opnds.push_back(I->getOperand(1));
14513       // Re-evaluate the number of nodes to be traversed.
14514       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14515       continue;
14516     }
14517 
14518     // Quit if this is not an EXTRACT_VECTOR_ELT.
14519     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14520       return SDValue();
14521 
14522     // Quit if the index is not a constant.
14523     SDValue Idx = I->getOperand(1);
14524     if (!isa<ConstantSDNode>(Idx))
14525       return SDValue();
14526 
14527     SDValue ExtractedFromVec = I->getOperand(0);
14528     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14529     if (M == VecInMap.end()) {
14530       VT = ExtractedFromVec.getValueType();
14531       // Quit if not 128/256-bit vector.
14532       if (!VT.is128BitVector() && !VT.is256BitVector())
14533         return SDValue();
14534       // Quit if not the same type.
14535       if (VecInMap.begin() != VecInMap.end() &&
14536           VT != VecInMap.begin()->first.getValueType())
14537         return SDValue();
14538       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14539       VecIns.push_back(ExtractedFromVec);
14540     }
14541     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14542   }
14543 
14544   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14545          "Not extracted from 128-/256-bit vector.");
14546 
14547   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14548 
14549   for (DenseMap<SDValue, unsigned>::const_iterator
14550         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14551     // Quit if not all elements are used.
14552     if (I->second != FullMask)
14553       return SDValue();
14554   }
14555 
14556   MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14557 
14558   // Cast all vectors into TestVT for PTEST.
14559   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14560     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
14561 
14562   // If more than one full vector is evaluated, OR them first before PTEST.
14563   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14564     // Each iteration will OR 2 nodes and append the result until there is only
14565     // 1 node left, i.e. the final OR'd value of all vectors.
14566     SDValue LHS = VecIns[Slot];
14567     SDValue RHS = VecIns[Slot + 1];
14568     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14569   }
14570 
14571   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14572                      VecIns.back(), VecIns.back());
14573 }
14574 
14575 /// \brief return true if \c Op has a use that doesn't just read flags.
14576 static bool hasNonFlagsUse(SDValue Op) {
14577   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14578        ++UI) {
14579     SDNode *User = *UI;
14580     unsigned UOpNo = UI.getOperandNo();
14581     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14582       // Look past the truncate.
14583       UOpNo = User->use_begin().getOperandNo();
14584       User = *User->use_begin();
14585     }
14586 
14587     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14588         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14589       return true;
14590   }
14591   return false;
14592 }
14593 
14594 // Emit KTEST instruction for bit vectors on AVX-512
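// E.g. with AVX-512DQ, an i16 that is a bitcast of a v16i1 mask register can
// be tested for zero with KTESTW k, k, avoiding a move of the mask into a
// general-purpose register followed by a TEST.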
14595 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
14596                          const X86Subtarget &Subtarget) {
14597   if (Op.getOpcode() == ISD::BITCAST) {
14598     auto hasKTEST = [&](MVT VT) {
14599       unsigned SizeInBits = VT.getSizeInBits();
14600       return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
14601         (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
14602     };
14603     SDValue Op0 = Op.getOperand(0);
14604     MVT Op0VT = Op0.getValueType().getSimpleVT();
14605     if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
14606         hasKTEST(Op0VT))
14607       return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
14608   }
14609   return SDValue();
14610 }
14611 
14612 /// Emit nodes that will be selected as "test Op0,Op0", or something
14613 /// equivalent.
14614 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
14615                                     SelectionDAG &DAG) const {
14616   if (Op.getValueType() == MVT::i1) {
14617     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
14618     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
14619                        DAG.getConstant(0, dl, MVT::i8));
14620   }
14621   // CF and OF aren't always set the way we want. Determine which
14622   // of these we need.
14623   bool NeedCF = false;
14624   bool NeedOF = false;
14625   switch (X86CC) {
14626   default: break;
14627   case X86::COND_A: case X86::COND_AE:
14628   case X86::COND_B: case X86::COND_BE:
14629     NeedCF = true;
14630     break;
14631   case X86::COND_G: case X86::COND_GE:
14632   case X86::COND_L: case X86::COND_LE:
14633   case X86::COND_O: case X86::COND_NO: {
14634     // Check if we really need to set the
14635     // Overflow flag. If NoSignedWrap is present
14636     // that is not actually needed.
14637     switch (Op->getOpcode()) {
14638     case ISD::ADD:
14639     case ISD::SUB:
14640     case ISD::MUL:
14641     case ISD::SHL: {
14642       const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
14643       if (BinNode->Flags.hasNoSignedWrap())
14644         break;
14645     }
14646     default:
14647       NeedOF = true;
14648       break;
14649     }
14650     break;
14651   }
14652   }
14653   // See if we can use the EFLAGS value from the operand instead of
14654   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14655   // we prove that the arithmetic won't overflow, we can't use OF or CF.
14656   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14657     // Emit KTEST for bit vectors
14658     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14659       return Node;
14660     // Emit a CMP with 0, which is the TEST pattern.
14661     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14662                        DAG.getConstant(0, dl, Op.getValueType()));
14663   }
14664   unsigned Opcode = 0;
14665   unsigned NumOperands = 0;
14666 
14667   // Truncate operations may prevent the merge of the SETCC instruction
14668   // and the arithmetic instruction before it. Attempt to truncate the operands
14669   // of the arithmetic instruction and use a reduced bit-width instruction.
14670   bool NeedTruncation = false;
14671   SDValue ArithOp = Op;
14672   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14673     SDValue Arith = Op->getOperand(0);
14674     // Both the trunc and the arithmetic op need to have one user each.
14675     if (Arith->hasOneUse())
14676       switch (Arith.getOpcode()) {
14677         default: break;
14678         case ISD::ADD:
14679         case ISD::SUB:
14680         case ISD::AND:
14681         case ISD::OR:
14682         case ISD::XOR: {
14683           NeedTruncation = true;
14684           ArithOp = Arith;
14685         }
14686       }
14687   }
14688 
14689   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14690   // which may be the result of a CAST.  We use the variable 'Op', which is the
14691   // non-casted variable when we check for possible users.
14692   switch (ArithOp.getOpcode()) {
14693   case ISD::ADD:
14694     // Due to an isel shortcoming, be conservative if this add is likely to be
14695     // selected as part of a load-modify-store instruction. When the root node
14696     // in a match is a store, isel doesn't know how to remap non-chain non-flag
14697     // uses of other nodes in the match, such as the ADD in this case. This
14698     // leads to the ADD being left around and reselected, with the result being
14699     // two adds in the output.  Alas, even if none of our users are stores, that
14700     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
14701     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
14702     // climbing the DAG back to the root, and it doesn't seem to be worth the
14703     // effort.
14704     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14705          UE = Op.getNode()->use_end(); UI != UE; ++UI)
14706       if (UI->getOpcode() != ISD::CopyToReg &&
14707           UI->getOpcode() != ISD::SETCC &&
14708           UI->getOpcode() != ISD::STORE)
14709         goto default_case;
14710 
14711     if (ConstantSDNode *C =
14712         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14713       // An add of one will be selected as an INC.
14714       if (C->isOne() && !Subtarget.slowIncDec()) {
14715         Opcode = X86ISD::INC;
14716         NumOperands = 1;
14717         break;
14718       }
14719 
14720       // An add of negative one (subtract of one) will be selected as a DEC.
14721       if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
14722         Opcode = X86ISD::DEC;
14723         NumOperands = 1;
14724         break;
14725       }
14726     }
14727 
14728     // Otherwise use a regular EFLAGS-setting add.
14729     Opcode = X86ISD::ADD;
14730     NumOperands = 2;
14731     break;
14732   case ISD::SHL:
14733   case ISD::SRL:
14734     // If we have a constant logical shift that's only used in a comparison
14735     // against zero turn it into an equivalent AND. This allows turning it into
14736     // a TEST instruction later.
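    // E.g. (srl X, 3) == 0 is equivalent to (X & 0xfffffff8) == 0 for an i32
    // X, which can be selected as a single TEST with an immediate instead of
    // a shift followed by a compare.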
14737     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14738         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14739       EVT VT = Op.getValueType();
14740       unsigned BitWidth = VT.getSizeInBits();
14741       unsigned ShAmt = Op->getConstantOperandVal(1);
14742       if (ShAmt >= BitWidth) // Avoid undefined shifts.
14743         break;
14744       APInt Mask = ArithOp.getOpcode() == ISD::SRL
14745                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14746                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14747       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14748         break;
14749       Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14750                        DAG.getConstant(Mask, dl, VT));
14751     }
14752     break;
14753 
14754   case ISD::AND:
14755     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
14756     // because a TEST instruction will be better.
14757     if (!hasNonFlagsUse(Op)) {
14758       SDValue Op0 = ArithOp->getOperand(0);
14759       SDValue Op1 = ArithOp->getOperand(1);
14760       EVT VT = ArithOp.getValueType();
14761       bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
14762       bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
14763 
14764       // But if we can combine this into an ANDN operation, then create an AND
14765       // now and allow it to be pattern matched into an ANDN.
14766       if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
14767         break;
14768     }
14769     // FALL THROUGH
14770   case ISD::SUB:
14771   case ISD::OR:
14772   case ISD::XOR:
14773     // Due to the ISEL shortcoming noted above, be conservative if this op is
14774     // likely to be selected as part of a load-modify-store instruction.
14775     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14776            UE = Op.getNode()->use_end(); UI != UE; ++UI)
14777       if (UI->getOpcode() == ISD::STORE)
14778         goto default_case;
14779 
14780     // Otherwise use a regular EFLAGS-setting instruction.
14781     switch (ArithOp.getOpcode()) {
14782     default: llvm_unreachable("unexpected operator!");
14783     case ISD::SUB: Opcode = X86ISD::SUB; break;
14784     case ISD::XOR: Opcode = X86ISD::XOR; break;
14785     case ISD::AND: Opcode = X86ISD::AND; break;
14786     case ISD::OR: {
14787       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14788         if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
14789           return EFLAGS;
14790       }
14791       Opcode = X86ISD::OR;
14792       break;
14793     }
14794     }
14795 
14796     NumOperands = 2;
14797     break;
14798   case X86ISD::ADD:
14799   case X86ISD::SUB:
14800   case X86ISD::INC:
14801   case X86ISD::DEC:
14802   case X86ISD::OR:
14803   case X86ISD::XOR:
14804   case X86ISD::AND:
14805     return SDValue(Op.getNode(), 1);
14806   default:
14807   default_case:
14808     break;
14809   }
14810 
14811   // If we found that truncation is beneficial, perform the truncation and
14812   // update 'Op'.
14813   if (NeedTruncation) {
14814     EVT VT = Op.getValueType();
14815     SDValue WideVal = Op->getOperand(0);
14816     EVT WideVT = WideVal.getValueType();
14817     unsigned ConvertedOp = 0;
14818     // Use a target machine opcode to prevent further DAGCombine
14819     // optimizations that may separate the arithmetic operations
14820     // from the setcc node.
14821     switch (WideVal.getOpcode()) {
14822       default: break;
14823       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
14824       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
14825       case ISD::AND: ConvertedOp = X86ISD::AND; break;
14826       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
14827       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
14828     }
14829 
14830     if (ConvertedOp) {
14831       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14832       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
14833         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
14834         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
14835         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
14836       }
14837     }
14838   }
14839 
14840   if (Opcode == 0) {
14841     // Emit KTEST for bit vectors
14842     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14843       return Node;
14844 
14845     // Emit a CMP with 0, which is the TEST pattern.
14846     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14847                        DAG.getConstant(0, dl, Op.getValueType()));
14848   }
14849   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
14850   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
14851 
14852   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
14853   DAG.ReplaceAllUsesWith(Op, New);
14854   return SDValue(New.getNode(), 1);
14855 }
14856 
14857 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
14858 /// equivalent.
14859 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
14860                                    const SDLoc &dl, SelectionDAG &DAG) const {
14861   if (isNullConstant(Op1))
14862     return EmitTest(Op0, X86CC, dl, DAG);
14863 
14864   assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
14865          "Unexpected comparison operation for MVT::i1 operands");
14866 
14867   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
14868        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
14869     // Only promote the compare up to I32 if it is a 16 bit operation
14870     // with an immediate.  16 bit immediates are to be avoided.
14871     if ((Op0.getValueType() == MVT::i16 &&
14872          (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
14873         !DAG.getMachineFunction().getFunction()->optForMinSize() &&
14874         !Subtarget.isAtom()) {
14875       unsigned ExtendOp =
14876           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
14877       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
14878       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
14879     }
14880     // Use SUB instead of CMP to enable CSE between SUB and CMP.
14881     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
14882     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
14883                               Op0, Op1);
14884     return SDValue(Sub.getNode(), 1);
14885   }
14886   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
14887 }
14888 
14889 /// Convert a comparison if required by the subtarget.
14890 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
14891                                                  SelectionDAG &DAG) const {
14892   // If the subtarget does not support the FUCOMI instruction, floating-point
14893   // comparisons have to be converted.
14894   if (Subtarget.hasCMov() ||
14895       Cmp.getOpcode() != X86ISD::CMP ||
14896       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
14897       !Cmp.getOperand(1).getValueType().isFloatingPoint())
14898     return Cmp;
14899 
14900   // The instruction selector will select an FUCOM instruction instead of
14901   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
14902   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
14903   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
14904   SDLoc dl(Cmp);
14905   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
14906   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
14907   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
14908                             DAG.getConstant(8, dl, MVT::i8));
14909   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
14910 
14911   // Some 64-bit targets lack SAHF support, but they do support FCOMI.
14912   assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
14913   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
14914 }
14915 
14916 /// The minimum architected relative accuracy is 2^-12. We need one
14917 /// Newton-Raphson step to have a good float result (24 bits of precision).
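/// The refinement itself is performed by the generic combiner; one
/// Newton-Raphson step for an estimate E of 1/sqrt(A) is typically
/// E' = E * (1.5 - 0.5 * A * E * E). This routine only returns the hardware
/// FRSQRT estimate and the number of refinement steps to apply.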
14918 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
14919                                             DAGCombinerInfo &DCI,
14920                                             unsigned &RefinementSteps,
14921                                             bool &UseOneConstNR) const {
14922   EVT VT = Op.getValueType();
14923   const char *RecipOp;
14924 
14925   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
14926   // TODO: Add support for AVX512 (v16f32).
14927   // It is likely not profitable to do this for f64 because a double-precision
14928   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
14929   // instructions: convert to single, rsqrtss, convert back to double, refine
14930   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
14931   // along with FMA, this could be a throughput win.
14932   if (VT == MVT::f32 && Subtarget.hasSSE1())
14933     RecipOp = "sqrtf";
14934   else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14935            (VT == MVT::v8f32 && Subtarget.hasAVX()))
14936     RecipOp = "vec-sqrtf";
14937   else
14938     return SDValue();
14939 
14940   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14941   if (!Recips.isEnabled(RecipOp))
14942     return SDValue();
14943 
14944   RefinementSteps = Recips.getRefinementSteps(RecipOp);
14945   UseOneConstNR = false;
14946   return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
14947 }
14948 
14949 /// The minimum architected relative accuracy is 2^-12. We need one
14950 /// Newton-Raphson step to have a good float result (24 bits of precision).
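/// The refinement itself is performed by the generic combiner; one
/// Newton-Raphson step for an estimate E of 1/A is typically
/// E' = E * (2.0 - A * E). This routine only returns the hardware FRCP
/// estimate and the number of refinement steps to apply.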
14951 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
14952                                             DAGCombinerInfo &DCI,
14953                                             unsigned &RefinementSteps) const {
14954   EVT VT = Op.getValueType();
14955   const char *RecipOp;
14956 
14957   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
14958   // TODO: Add support for AVX512 (v16f32).
14959   // It is likely not profitable to do this for f64 because a double-precision
14960   // reciprocal estimate with refinement on x86 prior to FMA requires
14961   // 15 instructions: convert to single, rcpss, convert back to double, refine
14962   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
14963   // along with FMA, this could be a throughput win.
14964   if (VT == MVT::f32 && Subtarget.hasSSE1())
14965     RecipOp = "divf";
14966   else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14967            (VT == MVT::v8f32 && Subtarget.hasAVX()))
14968     RecipOp = "vec-divf";
14969   else
14970     return SDValue();
14971 
14972   TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14973   if (!Recips.isEnabled(RecipOp))
14974     return SDValue();
14975 
14976   RefinementSteps = Recips.getRefinementSteps(RecipOp);
14977   return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
14978 }
14979 
14980 /// If we have at least two divisions that use the same divisor, convert to
14981 /// multiplication by a reciprocal. This may need to be adjusted for a given
14982 /// CPU if a division's cost is not at least twice the cost of a multiplication.
14983 /// This is because we still need one division to calculate the reciprocal and
14984 /// then we need two multiplies by that reciprocal as replacements for the
14985 /// original divisions.
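/// E.g. x/d + y/d is rewritten as r = 1.0/d; x*r + y*r, trading two divisions
/// for one division and two multiplications.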
14986 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
14987   return 2;
14988 }
14989 
14990 /// Result of 'and' is compared against zero. Change to a BT node if possible.
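/// E.g. ((X >> N) & 1) != 0 and (X & (1 << N)) != 0 can both be selected as
/// BT X, N, with the result read from the carry flag via SETB/SETAE.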
14991 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
14992                                      const SDLoc &dl, SelectionDAG &DAG) const {
14993   SDValue Op0 = And.getOperand(0);
14994   SDValue Op1 = And.getOperand(1);
14995   if (Op0.getOpcode() == ISD::TRUNCATE)
14996     Op0 = Op0.getOperand(0);
14997   if (Op1.getOpcode() == ISD::TRUNCATE)
14998     Op1 = Op1.getOperand(0);
14999 
15000   SDValue LHS, RHS;
15001   if (Op1.getOpcode() == ISD::SHL)
15002     std::swap(Op0, Op1);
15003   if (Op0.getOpcode() == ISD::SHL) {
15004     if (isOneConstant(Op0.getOperand(0))) {
15005       // If we looked past a truncate, check that it's only truncating away
15006       // known zeros.
15007       unsigned BitWidth = Op0.getValueSizeInBits();
15008       unsigned AndBitWidth = And.getValueSizeInBits();
15009       if (BitWidth > AndBitWidth) {
15010         APInt Zeros, Ones;
15011         DAG.computeKnownBits(Op0, Zeros, Ones);
15012         if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15013           return SDValue();
15014       }
15015       LHS = Op1;
15016       RHS = Op0.getOperand(1);
15017     }
15018   } else if (Op1.getOpcode() == ISD::Constant) {
15019     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15020     uint64_t AndRHSVal = AndRHS->getZExtValue();
15021     SDValue AndLHS = Op0;
15022 
15023     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15024       LHS = AndLHS.getOperand(0);
15025       RHS = AndLHS.getOperand(1);
15026     }
15027 
15028     // Use BT if the immediate can't be encoded in a TEST instruction.
15029     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15030       LHS = AndLHS;
15031       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
15032     }
15033   }
15034 
15035   if (LHS.getNode()) {
15036     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15037     // instruction.  Since the shift amount is in-range-or-undefined, we know
15038     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15039     // the encoding for the i16 version is larger than the i32 version.
15040     // Also promote i16 to i32 for performance / code size reason.
15041     if (LHS.getValueType() == MVT::i8 ||
15042         LHS.getValueType() == MVT::i16)
15043       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15044 
15045     // If the operand types disagree, extend the shift amount to match.  Since
15046     // BT ignores high bits (like shifts) we can use anyextend.
15047     if (LHS.getValueType() != RHS.getValueType())
15048       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15049 
15050     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15051     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15052     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15053                        DAG.getConstant(Cond, dl, MVT::i8), BT);
15054   }
15055 
15056   return SDValue();
15057 }
15058 
15059 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
15060 /// CMPs.
15061 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15062                               SDValue &Op1) {
15063   unsigned SSECC;
15064   bool Swap = false;
15065 
15066   // SSE Condition code mapping:
15067   //  0 - EQ
15068   //  1 - LT
15069   //  2 - LE
15070   //  3 - UNORD
15071   //  4 - NEQ
15072   //  5 - NLT
15073   //  6 - NLE
15074   //  7 - ORD
15075   switch (SetCCOpcode) {
15076   default: llvm_unreachable("Unexpected SETCC condition");
15077   case ISD::SETOEQ:
15078   case ISD::SETEQ:  SSECC = 0; break;
15079   case ISD::SETOGT:
15080   case ISD::SETGT:  Swap = true; // Fallthrough
15081   case ISD::SETLT:
15082   case ISD::SETOLT: SSECC = 1; break;
15083   case ISD::SETOGE:
15084   case ISD::SETGE:  Swap = true; // Fallthrough
15085   case ISD::SETLE:
15086   case ISD::SETOLE: SSECC = 2; break;
15087   case ISD::SETUO:  SSECC = 3; break;
15088   case ISD::SETUNE:
15089   case ISD::SETNE:  SSECC = 4; break;
15090   case ISD::SETULE: Swap = true; // Fallthrough
15091   case ISD::SETUGE: SSECC = 5; break;
15092   case ISD::SETULT: Swap = true; // Fallthrough
15093   case ISD::SETUGT: SSECC = 6; break;
15094   case ISD::SETO:   SSECC = 7; break;
15095   case ISD::SETUEQ:
15096   case ISD::SETONE: SSECC = 8; break;
15097   }
15098   if (Swap)
15099     std::swap(Op0, Op1);
15100 
15101   return SSECC;
15102 }
15103 
15104 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
15105 /// concatenate the result back.
15106 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15107   MVT VT = Op.getSimpleValueType();
15108 
15109   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15110          "Unsupported value type for operation");
15111 
15112   unsigned NumElems = VT.getVectorNumElements();
15113   SDLoc dl(Op);
15114   SDValue CC = Op.getOperand(2);
15115 
15116   // Extract the LHS vectors
15117   SDValue LHS = Op.getOperand(0);
15118   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
15119   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
15120 
15121   // Extract the RHS vectors
15122   SDValue RHS = Op.getOperand(1);
15123   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
15124   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
15125 
15126   // Issue the operation on the smaller types and concatenate the result back
15127   MVT EltVT = VT.getVectorElementType();
15128   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15129   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15130                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15131                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15132 }
15133 
15134 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15135   SDValue Op0 = Op.getOperand(0);
15136   SDValue Op1 = Op.getOperand(1);
15137   SDValue CC = Op.getOperand(2);
15138   MVT VT = Op.getSimpleValueType();
15139   SDLoc dl(Op);
15140 
15141   assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15142          "Unexpected type for boolean compare operation");
15143   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15144   SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
15145                                DAG.getConstant(-1, dl, VT));
15146   SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
15147                                DAG.getConstant(-1, dl, VT));
15148   switch (SetCCOpcode) {
15149   default: llvm_unreachable("Unexpected SETCC condition");
15150   case ISD::SETEQ:
15151     // (x == y) -> ~(x ^ y)
15152     return DAG.getNode(ISD::XOR, dl, VT,
15153                        DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
15154                        DAG.getConstant(-1, dl, VT));
15155   case ISD::SETNE:
15156     // (x != y) -> (x ^ y)
15157     return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
15158   case ISD::SETUGT:
15159   case ISD::SETGT:
15160     // (x > y) -> (x & ~y)
15161     return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
15162   case ISD::SETULT:
15163   case ISD::SETLT:
15164     // (x < y) -> (~x & y)
15165     return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
15166   case ISD::SETULE:
15167   case ISD::SETLE:
15168     // (x <= y) -> (~x | y)
15169     return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
15170   case ISD::SETUGE:
15171   case ISD::SETGE:
15172     // (x >= y) -> (x | ~y)
15173     return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
15174   }
15175 }
15176 
15177 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15178 
15179   SDValue Op0 = Op.getOperand(0);
15180   SDValue Op1 = Op.getOperand(1);
15181   SDValue CC = Op.getOperand(2);
15182   MVT VT = Op.getSimpleValueType();
15183   SDLoc dl(Op);
15184 
15185   assert(VT.getVectorElementType() == MVT::i1 &&
15186          "Cannot set masked compare for this operation");
15187 
15188   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15189   unsigned  Opc = 0;
15190   bool Unsigned = false;
15191   bool Swap = false;
15192   unsigned SSECC;
15193   switch (SetCCOpcode) {
15194   default: llvm_unreachable("Unexpected SETCC condition");
15195   case ISD::SETNE:  SSECC = 4; break;
15196   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15197   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15198   case ISD::SETLT:  Swap = true; //fall-through
15199   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15200   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15201   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15202   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15203   case ISD::SETULE: Unsigned = true; //fall-through
15204   case ISD::SETLE:  SSECC = 2; break;
15205   }
15206 
15207   if (Swap)
15208     std::swap(Op0, Op1);
15209   if (Opc)
15210     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15211   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15212   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15213                      DAG.getConstant(SSECC, dl, MVT::i8));
15214 }
15215 
15216 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15217 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15218 /// return an empty value.
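/// E.g. a constant-splat comparison X u< <4, 4, 4, 4> can be rewritten as
/// X u<= <3, 3, 3, 3>, which maps directly onto the PSUBUS-based lowering
/// below.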
15219 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
15220                                       SelectionDAG &DAG) {
15221   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15222   if (!BV)
15223     return SDValue();
15224 
15225   MVT VT = Op1.getSimpleValueType();
15226   MVT EVT = VT.getVectorElementType();
15227   unsigned n = VT.getVectorNumElements();
15228   SmallVector<SDValue, 8> ULTOp1;
15229 
15230   for (unsigned i = 0; i < n; ++i) {
15231     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15232     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
15233       return SDValue();
15234 
15235     // Avoid underflow.
15236     APInt Val = Elt->getAPIntValue();
15237     if (Val == 0)
15238       return SDValue();
15239 
15240     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
15241   }
15242 
15243   return DAG.getBuildVector(VT, dl, ULTOp1);
15244 }
15245 
15246 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
15247                            SelectionDAG &DAG) {
15248   SDValue Op0 = Op.getOperand(0);
15249   SDValue Op1 = Op.getOperand(1);
15250   SDValue CC = Op.getOperand(2);
15251   MVT VT = Op.getSimpleValueType();
15252   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15253   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15254   SDLoc dl(Op);
15255 
15256   if (isFP) {
15257 #ifndef NDEBUG
15258     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15259     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15260 #endif
15261 
15262     unsigned Opc;
15263     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15264       assert(VT.getVectorNumElements() <= 16);
15265       Opc = X86ISD::CMPM;
15266     } else {
15267       Opc = X86ISD::CMPP;
15268       // The SSE/AVX packed FP comparison nodes are defined with a
15269       // floating-point vector result that matches the operand type. This allows
15270       // them to work with an SSE1 target (integer vector types are not legal).
15271       VT = Op0.getSimpleValueType();
15272     }
15273 
15274     // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
15275     // emit two comparisons and a logic op to tie them together.
15276     // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
15277     // available.
15278     SDValue Cmp;
15279     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15280     if (SSECC == 8) {
15281       // LLVM predicate is SETUEQ or SETONE.
15282       unsigned CC0, CC1;
15283       unsigned CombineOpc;
15284       if (SetCCOpcode == ISD::SETUEQ) {
15285         CC0 = 3; // UNORD
15286         CC1 = 0; // EQ
15287         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
15288                                            static_cast<unsigned>(ISD::OR);
15289       } else {
15290         assert(SetCCOpcode == ISD::SETONE);
15291         CC0 = 7; // ORD
15292         CC1 = 4; // NEQ
15293         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
15294                                            static_cast<unsigned>(ISD::AND);
15295       }
15296 
15297       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15298                                  DAG.getConstant(CC0, dl, MVT::i8));
15299       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15300                                  DAG.getConstant(CC1, dl, MVT::i8));
15301       Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15302     } else {
15303       // Handle all other FP comparisons here.
15304       Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
15305                         DAG.getConstant(SSECC, dl, MVT::i8));
15306     }
15307 
15308     // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
15309     // result type of SETCC. The bitcast is expected to be optimized away
15310     // during combining/isel.
15311     if (Opc == X86ISD::CMPP)
15312       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
15313 
15314     return Cmp;
15315   }
15316 
15317   MVT VTOp0 = Op0.getSimpleValueType();
15318   assert(VTOp0 == Op1.getSimpleValueType() &&
15319          "Expected operands with same type!");
15320   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
15321          "Invalid number of packed elements for source and destination!");
15322 
15323   if (VT.is128BitVector() && VTOp0.is256BitVector()) {
15324     // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
15325     // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
15326     // legalizer first checks whether the first input operand of the setcc has
15327     // a legal type. If so, it promotes the return type to that same type.
15328     // Otherwise, the return type is promoted to the 'next legal type' which,
15329     // for a vector of MVT::i1 is always a 128-bit integer vector type.
15330     //
15331     // We reach this code only if the following two conditions are met:
15332     // 1. Both return type and operand type have been promoted to wider types
15333     //    by the type legalizer.
15334     // 2. The original operand type has been promoted to a 256-bit vector.
15335     //
15336     // Note that condition 2. only applies for AVX targets.
15337     SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
15338     return DAG.getZExtOrTrunc(NewOp, dl, VT);
15339   }
15340 
15341   // The non-AVX512 code below works under the assumption that source and
15342   // destination types are the same.
15343   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
15344          "Value types for source and destination must be the same!");
15345 
15346   // Break 256-bit integer vector compare into smaller ones.
15347   if (VT.is256BitVector() && !Subtarget.hasInt256())
15348     return Lower256IntVSETCC(Op, DAG);
15349 
15350   // Operands are boolean (vectors of i1)
15351   MVT OpVT = Op1.getSimpleValueType();
15352   if (OpVT.getVectorElementType() == MVT::i1)
15353     return LowerBoolVSETCC_AVX512(Op, DAG);
15354 
15355   // The result is boolean, but operands are int/float
15356   if (VT.getVectorElementType() == MVT::i1) {
15357     // On AVX-512, setcc returns a mask with i1 elements,
15358     // but KNL has no compare instructions for i8 and i16 elements.
15359     // In that case, use an SSE compare.
15360     bool UseAVX512Inst =
15361       (OpVT.is512BitVector() ||
15362        OpVT.getVectorElementType().getSizeInBits() >= 32 ||
15363        (Subtarget.hasBWI() && Subtarget.hasVLX()));
15364 
15365     if (UseAVX512Inst)
15366       return LowerIntVSETCC_AVX512(Op, DAG);
15367 
15368     return DAG.getNode(ISD::TRUNCATE, dl, VT,
15369                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15370   }
15371 
15372   // Lower using XOP integer comparisons.
15373   if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
15374        VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
15375     // Translate compare code to XOP PCOM compare mode.
15376     unsigned CmpMode = 0;
15377     switch (SetCCOpcode) {
15378     default: llvm_unreachable("Unexpected SETCC condition");
15379     case ISD::SETULT:
15380     case ISD::SETLT: CmpMode = 0x00; break;
15381     case ISD::SETULE:
15382     case ISD::SETLE: CmpMode = 0x01; break;
15383     case ISD::SETUGT:
15384     case ISD::SETGT: CmpMode = 0x02; break;
15385     case ISD::SETUGE:
15386     case ISD::SETGE: CmpMode = 0x03; break;
15387     case ISD::SETEQ: CmpMode = 0x04; break;
15388     case ISD::SETNE: CmpMode = 0x05; break;
15389     }
15390 
15391     // Are we comparing unsigned or signed integers?
15392     unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
15393       ? X86ISD::VPCOMU : X86ISD::VPCOM;
15394 
15395     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15396                        DAG.getConstant(CmpMode, dl, MVT::i8));
15397   }
15398 
15399   // We are handling one of the integer comparisons here.  Since SSE only has
15400   // GT and EQ comparisons for integer, swapping operands and multiple
15401   // operations may be required for some comparisons.
15402   unsigned Opc;
15403   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15404   bool Subus = false;
15405 
15406   switch (SetCCOpcode) {
15407   default: llvm_unreachable("Unexpected SETCC condition");
15408   case ISD::SETNE:  Invert = true;
15409   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15410   case ISD::SETLT:  Swap = true;
15411   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15412   case ISD::SETGE:  Swap = true;
15413   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15414                     Invert = true; break;
15415   case ISD::SETULT: Swap = true;
15416   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15417                     FlipSigns = true; break;
15418   case ISD::SETUGE: Swap = true;
15419   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15420                     FlipSigns = true; Invert = true; break;
15421   }
15422 
15423   // Special case: Use min/max operations for SETULE/SETUGE
15424   MVT VET = VT.getVectorElementType();
15425   bool hasMinMax =
15426        (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15427     || (Subtarget.hasSSE2()  && (VET == MVT::i8));
15428 
15429   if (hasMinMax) {
15430     switch (SetCCOpcode) {
15431     default: break;
15432     case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
15433     case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
15434     }
15435 
15436     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15437   }
15438 
15439   bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15440   if (!MinMax && hasSubus) {
15441     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15442     // Op0 u<= Op1:
15443     //   t = psubus Op0, Op1
15444     //   pcmpeq t, <0..0>
15445     switch (SetCCOpcode) {
15446     default: break;
15447     case ISD::SETULT: {
15448       // If the comparison is against a constant we can turn this into a
15449       // setule.  With psubus, setule does not require a swap.  This is
15450       // beneficial because the constant in the register is no longer
15451       // clobbered as the destination, so it can be hoisted out of a loop.
15452       // Only do this pre-AVX since vpcmp* is no longer destructive.
15453       if (Subtarget.hasAVX())
15454         break;
15455       if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
15456         Op1 = ULEOp1;
15457         Subus = true; Invert = false; Swap = false;
15458       }
15459       break;
15460     }
15461     // Psubus is better than flip-sign because it requires no inversion.
15462     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15463     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15464     }
15465 
15466     if (Subus) {
15467       Opc = X86ISD::SUBUS;
15468       FlipSigns = false;
15469     }
15470   }
15471 
15472   if (Swap)
15473     std::swap(Op0, Op1);
15474 
15475   // Check that the operation in question is available (most are plain SSE2,
15476   // but PCMPGTQ and PCMPEQQ have different requirements).
15477   if (VT == MVT::v2i64) {
15478     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
15479       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
15480 
15481       // First cast everything to the right type.
15482       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15483       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15484 
15485       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15486       // bits of the inputs before performing those operations. The lower
15487       // compare is always unsigned.
15488       SDValue SB;
15489       if (FlipSigns) {
15490         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
15491       } else {
15492         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
15493         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
15494         SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
15495       }
15496       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15497       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15498 
15499       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15500       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15501       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15502 
15503       // Create masks for only the low parts/high parts of the 64-bit integers.
15504       static const int MaskHi[] = { 1, 1, 3, 3 };
15505       static const int MaskLo[] = { 0, 0, 2, 2 };
15506       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15507       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15508       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15509 
15510       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15511       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15512 
15513       if (Invert)
15514         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15515 
15516       return DAG.getBitcast(VT, Result);
15517     }
15518 
15519     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
15520       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15521       // pcmpeqd + pshufd + pand.
15522       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
15523 
15524       // First cast everything to the right type.
15525       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15526       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15527 
15528       // Do the compare.
15529       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15530 
15531       // Make sure the lower and upper halves are both all-ones.
15532       static const int Mask[] = { 1, 0, 3, 2 };
15533       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15534       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15535 
15536       if (Invert)
15537         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15538 
15539       return DAG.getBitcast(VT, Result);
15540     }
15541   }
15542 
15543   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15544   // bits of the inputs before performing those operations.
15545   if (FlipSigns) {
15546     MVT EltVT = VT.getVectorElementType();
15547     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
15548                                  VT);
15549     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15550     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15551   }
15552 
15553   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15554 
15555   // If the logical-not of the result is required, perform that now.
15556   if (Invert)
15557     Result = DAG.getNOT(dl, Result, VT);
15558 
15559   if (MinMax)
15560     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15561 
15562   if (Subus)
15563     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15564                          getZeroVector(VT, Subtarget, DAG, dl));
15565 
15566   return Result;
15567 }
15568 
15569 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15570 
15571   MVT VT = Op.getSimpleValueType();
15572 
15573   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15574 
15575   assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15576          && "SetCC type must be 8-bit or 1-bit integer");
15577   SDValue Op0 = Op.getOperand(0);
15578   SDValue Op1 = Op.getOperand(1);
15579   SDLoc dl(Op);
15580   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15581 
15582   // Optimize to BT if possible.
15583   // Lower (X & (1 << N)) == 0 to BT(X, N).
15584   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15585   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15586   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15587       isNullConstant(Op1) &&
15588       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15589     if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
15590       if (VT == MVT::i1) {
15591         NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
15592                                DAG.getValueType(MVT::i1));
15593         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15594       }
15595       return NewSetCC;
15596     }
15597   }
15598 
15599   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15600   // these.
15601   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
15602       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15603 
15604     // If the input is a setcc, then reuse the input setcc or use a new one with
15605     // the inverted condition.
15606     if (Op0.getOpcode() == X86ISD::SETCC) {
15607       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15608       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
15609       if (!Invert)
15610         return Op0;
15611 
15612       CCode = X86::GetOppositeBranchCondition(CCode);
15613       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15614                                   DAG.getConstant(CCode, dl, MVT::i8),
15615                                   Op0.getOperand(1));
15616       if (VT == MVT::i1) {
15617         SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15618                             DAG.getValueType(MVT::i1));
15619         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15620       }
15621       return SetCC;
15622     }
15623   }
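  // For i1 operands: a compare against the constant 1 becomes an inverted
  // compare against 0, and a compare against anything else non-zero becomes
  // a compare of (Op0 xor Op1) against 0.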
15624   if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15625     if (isOneConstant(Op1)) {
15626       ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15627       return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
15628     }
15629     if (!isNullConstant(Op1)) {
15630       SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
15631       return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
15632     }
15633   }
15634 
15635   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15636   unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
15637   if (X86CC == X86::COND_INVALID)
15638     return SDValue();
15639 
15640   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15641   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15642   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15643                               DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
15644   if (VT == MVT::i1) {
15645     SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15646                         DAG.getValueType(MVT::i1));
15647     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15648   }
15649   return SetCC;
15650 }
15651 
15652 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
15653   SDValue LHS = Op.getOperand(0);
15654   SDValue RHS = Op.getOperand(1);
15655   SDValue Carry = Op.getOperand(2);
15656   SDValue Cond = Op.getOperand(3);
15657   SDLoc DL(Op);
15658 
15659   assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
15660   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
15661 
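  // Lower to an SBB that consumes the incoming carry and produces EFLAGS,
  // then materialize the requested condition from those flags via SETCC.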
15662   assert(Carry.getOpcode() != ISD::CARRY_FALSE);
15663   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15664   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
15665   SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15666                               DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
15667   if (Op.getSimpleValueType() == MVT::i1) {
15668     SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
15669                         DAG.getValueType(MVT::i1));
15670     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
15671   }
15672   return SetCC;
15673 }
15674 
15675 /// Return true if opcode is a X86 logical comparison.
15676 static bool isX86LogicalCmp(SDValue Op) {
15677   unsigned Opc = Op.getNode()->getOpcode();
15678   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15679       Opc == X86ISD::SAHF)
15680     return true;
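  // The arithmetic nodes below produce EFLAGS as their second result; they
  // only act as a compare when that flags result (ResNo 1) is the value used.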
15681   if (Op.getResNo() == 1 &&
15682       (Opc == X86ISD::ADD ||
15683        Opc == X86ISD::SUB ||
15684        Opc == X86ISD::ADC ||
15685        Opc == X86ISD::SBB ||
15686        Opc == X86ISD::SMUL ||
15687        Opc == X86ISD::UMUL ||
15688        Opc == X86ISD::INC ||
15689        Opc == X86ISD::DEC ||
15690        Opc == X86ISD::OR ||
15691        Opc == X86ISD::XOR ||
15692        Opc == X86ISD::AND))
15693     return true;
15694 
15695   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15696     return true;
15697 
15698   return false;
15699 }
15700 
15701 /// Returns the "condition" node, which may be wrapped in a truncate,
15702 /// like this: (i1 (trunc (i8 X86ISD::SETCC))).
15703 static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15704   if (V.getOpcode() != ISD::TRUNCATE)
15705     return V;
15706 
15707   SDValue VOp0 = V.getOperand(0);
15708   if (VOp0.getOpcode() == ISD::AssertZext &&
15709       V.getValueSizeInBits() ==
15710       cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
15711     return VOp0.getOperand(0);
15712 
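  // Otherwise the truncate can only be looked through if the high bits being
  // dropped are already known to be zero.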
15713   unsigned InBits = VOp0.getValueSizeInBits();
15714   unsigned Bits = V.getValueSizeInBits();
15715   if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
15716     return V.getOperand(0);
15717   return V;
15718 }
15719 
15720 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15721   bool addTest = true;
15722   SDValue Cond  = Op.getOperand(0);
15723   SDValue Op1 = Op.getOperand(1);
15724   SDValue Op2 = Op.getOperand(2);
15725   SDLoc DL(Op);
15726   MVT VT = Op1.getSimpleValueType();
15727   SDValue CC;
15728 
15729   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15730   // are available or VBLENDV if AVX is available.
15731   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
15732   if (Cond.getOpcode() == ISD::SETCC &&
15733       ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15734        (Subtarget.hasSSE1() && VT == MVT::f32)) &&
15735       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
15736     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15737     int SSECC = translateX86FSETCC(
15738         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15739 
15740     if (SSECC != 8) {
15741       if (Subtarget.hasAVX512()) {
15742         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15743                                   DAG.getConstant(SSECC, DL, MVT::i8));
15744         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15745       }
15746 
15747       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15748                                 DAG.getConstant(SSECC, DL, MVT::i8));
15749 
15750       // If we have AVX, we can use a variable vector select (VBLENDV) instead
15751       // of 3 logic instructions for size savings and potentially speed.
15752       // Unfortunately, there is no scalar form of VBLENDV.
15753 
15754       // If either operand is a constant, don't try this. We can expect to
15755       // optimize away at least one of the logic instructions later in that
15756       // case, so that sequence would be faster than a variable blend.
15757 
15758       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
15759       // uses XMM0 as the selection register. That may need just as many
15760       // instructions as the AND/ANDN/OR sequence due to register moves, so
15761       // don't bother.
15762 
15763       if (Subtarget.hasAVX() &&
15764           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
15765 
15766         // Convert to vectors, do a VSELECT, and convert back to scalar.
15767         // All of the conversions should be optimized away.
15768 
15769         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
15770         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
15771         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
15772         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
15773 
15774         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
15775         VCmp = DAG.getBitcast(VCmpVT, VCmp);
15776 
15777         SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
15778 
15779         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
15780                            VSel, DAG.getIntPtrConstant(0, DL));
15781       }
15782       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15783       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15784       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15785     }
15786   }
15787 
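  // For vXi1 selects, try to operate on the masks as scalar integers: if both
  // operands can be expressed as integers (constant build_vectors or bitcasts
  // from scalars), select between the integers and bitcast the result back.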
15788   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
15789     SDValue Op1Scalar;
15790     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
15791       Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
15792     else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
15793       Op1Scalar = Op1.getOperand(0);
15794     SDValue Op2Scalar;
15795     if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
15796       Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
15797     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
15798       Op2Scalar = Op2.getOperand(0);
15799     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
15800       SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
15801                                       Op1Scalar.getValueType(),
15802                                       Cond, Op1Scalar, Op2Scalar);
15803       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
15804         return DAG.getBitcast(VT, newSelect);
15805       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
15806       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
15807                          DAG.getIntPtrConstant(0, DL));
15808     }
15809   }
15810 
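  // v2i1 and v4i1 selects are not legal; widen the operands to v8i1, select,
  // and extract the original number of elements from the result.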
15811   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
15812     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
15813     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15814                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
15815     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15816                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
15817     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
15818                                     Cond, Op1, Op2);
15819     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
15820   }
15821 
15822   if (Cond.getOpcode() == ISD::SETCC) {
15823     if (SDValue NewCond = LowerSETCC(Cond, DAG))
15824       Cond = NewCond;
15825   }
15826 
15827   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15828   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15829   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15830   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
15831   if (Cond.getOpcode() == X86ISD::SETCC &&
15832       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15833       isNullConstant(Cond.getOperand(1).getOperand(1))) {
15834     SDValue Cmp = Cond.getOperand(1);
15835 
15836     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15837 
15838     if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15839         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15840       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
15841 
15842       SDValue CmpOp0 = Cmp.getOperand(0);
15843       // Apply further optimizations for special cases
15844       // (select (x != 0), -1, 0) -> neg & sbb
15845       // (select (x == 0), 0, -1) -> neg & sbb
15846       if (isNullConstant(Y) &&
15847             (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
15848           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15849           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15850                                     DAG.getConstant(0, DL,
15851                                                     CmpOp0.getValueType()),
15852                                     CmpOp0);
15853           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15854                                     DAG.getConstant(X86::COND_B, DL, MVT::i8),
15855                                     SDValue(Neg.getNode(), 1));
15856           return Res;
15857         }
15858 
15859       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15860                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
15861       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15862 
15863       SDValue Res =   // Res = 0 or -1.
15864         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15865                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
15866 
15867       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
15868         Res = DAG.getNOT(DL, Res, Res.getValueType());
15869 
15870       if (!isNullConstant(Op2))
15871         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15872       return Res;
15873     }
15874   }
15875 
15876   // Look past (and (setcc_carry (cmp ...)), 1).
15877   if (Cond.getOpcode() == ISD::AND &&
15878       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
15879       isOneConstant(Cond.getOperand(1)))
15880     Cond = Cond.getOperand(0);
15881 
15882   // If condition flag is set by a X86ISD::CMP, then use it as the condition
15883   // setting operand in place of the X86ISD::SETCC.
15884   unsigned CondOpcode = Cond.getOpcode();
15885   if (CondOpcode == X86ISD::SETCC ||
15886       CondOpcode == X86ISD::SETCC_CARRY) {
15887     CC = Cond.getOperand(0);
15888 
15889     SDValue Cmp = Cond.getOperand(1);
15890     unsigned Opc = Cmp.getOpcode();
15891     MVT VT = Op.getSimpleValueType();
15892 
15893     bool IllegalFPCMov = false;
15894     if (VT.isFloatingPoint() && !VT.isVector() &&
15895         !isScalarFPTypeInSSEReg(VT))  // FPStack?
15896       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15897 
15898     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15899         Opc == X86ISD::BT) { // FIXME
15900       Cond = Cmp;
15901       addTest = false;
15902     }
15903   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15904              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15905              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15906               Cond.getOperand(0).getValueType() != MVT::i8)) {
15907     SDValue LHS = Cond.getOperand(0);
15908     SDValue RHS = Cond.getOperand(1);
15909     unsigned X86Opcode;
15910     unsigned X86Cond;
15911     SDVTList VTs;
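    // Re-emit the overflow arithmetic as an X86 node that defines EFLAGS and
    // select on the corresponding condition code instead of the i1 overflow bit.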
15912     switch (CondOpcode) {
15913     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15914     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15915     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15916     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15917     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15918     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15919     default: llvm_unreachable("unexpected overflowing operator");
15920     }
15921     if (CondOpcode == ISD::UMULO)
15922       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15923                           MVT::i32);
15924     else
15925       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15926 
15927     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15928 
15929     if (CondOpcode == ISD::UMULO)
15930       Cond = X86Op.getValue(2);
15931     else
15932       Cond = X86Op.getValue(1);
15933 
15934     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
15935     addTest = false;
15936   }
15937 
15938   if (addTest) {
15939     // Look past the truncate if the high bits are known zero.
15940     Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
15941 
15942     // We know the result of AND is compared against zero. Try to match
15943     // it to BT.
15944     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15945       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
15946         CC = NewSetCC.getOperand(0);
15947         Cond = NewSetCC.getOperand(1);
15948         addTest = false;
15949       }
15950     }
15951   }
15952 
15953   if (addTest) {
15954     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
15955     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15956   }
15957 
15958   // a <  b ? -1 :  0 -> RES = ~setcc_carry
15959   // a <  b ?  0 : -1 -> RES = setcc_carry
15960   // a >= b ? -1 :  0 -> RES = setcc_carry
15961   // a >= b ?  0 : -1 -> RES = ~setcc_carry
15962   if (Cond.getOpcode() == X86ISD::SUB) {
15963     Cond = ConvertCmpIfNecessary(Cond, DAG);
15964     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15965 
15966     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15967         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15968         (isNullConstant(Op1) || isNullConstant(Op2))) {
15969       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15970                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
15971                                 Cond);
15972       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
15973         return DAG.getNOT(DL, Res, Res.getValueType());
15974       return Res;
15975     }
15976   }
15977 
15978   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
15979   // widen the cmov and push the truncate through. This avoids introducing a new
15980   // branch during isel and doesn't add any extensions.
15981   if (Op.getValueType() == MVT::i8 &&
15982       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15983     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15984     if (T1.getValueType() == T2.getValueType() &&
15985         // Blacklist CopyFromReg to avoid partial register stalls.
15986         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15987       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15988       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15989       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15990     }
15991   }
15992 
15993   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15994   // the condition is true.
15995   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15996   SDValue Ops[] = { Op2, Op1, CC, Cond };
15997   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15998 }
15999 
16000 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
16001                                        const X86Subtarget &Subtarget,
16002                                        SelectionDAG &DAG) {
16003   MVT VT = Op->getSimpleValueType(0);
16004   SDValue In = Op->getOperand(0);
16005   MVT InVT = In.getSimpleValueType();
16006   MVT VTElt = VT.getVectorElementType();
16007   MVT InVTElt = InVT.getVectorElementType();
16008   SDLoc dl(Op);
16009 
16010   // SKX-class processors (AVX-512 with the BW/DQ/VL extensions).
16011   if ((InVTElt == MVT::i1) &&
16012       (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
16013         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16014 
16015        ((Subtarget.hasBWI() && VT.is512BitVector() &&
16016         VTElt.getSizeInBits() <= 16)) ||
16017 
16018        ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
16019         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16020 
16021        ((Subtarget.hasDQI() && VT.is512BitVector() &&
16022         VTElt.getSizeInBits() >= 32))))
16023     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16024 
16025   unsigned int NumElts = VT.getVectorNumElements();
16026 
16027   if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
16028     return SDValue();
16029 
16030   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16031     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16032       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16033     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16034   }
16035 
16036   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
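  // Without a native i1 sign-extension, materialize the result as a VSELECT
  // between all-ones and zero in a wide legal type, truncating afterwards if
  // the destination is narrower.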
16037   MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
16038   SDValue NegOne =
16039    DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
16040                    ExtVT);
16041   SDValue Zero =
16042    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
16043 
16044   SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
16045   if (VT.is512BitVector())
16046     return V;
16047   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
16048 }
16049 
16050 static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
16051                                              const X86Subtarget &Subtarget,
16052                                              SelectionDAG &DAG) {
16053   SDValue In = Op->getOperand(0);
16054   MVT VT = Op->getSimpleValueType(0);
16055   MVT InVT = In.getSimpleValueType();
16056   assert(VT.getSizeInBits() == InVT.getSizeInBits());
16057 
16058   MVT SVT = VT.getVectorElementType();
16059   MVT InSVT = InVT.getVectorElementType();
16060   assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
16061 
16062   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
16063     return SDValue();
16064   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
16065     return SDValue();
16066   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
16067       !(VT.is256BitVector() && Subtarget.hasInt256()))
16068     return SDValue();
16069 
16070   SDLoc dl(Op);
16071 
16072   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
16073   if (VT.is256BitVector())
16074     In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
16075                      MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
16076                      In, DAG.getIntPtrConstant(0, dl));
16077 
16078   // SSE41 targets can use the pmovsx* instructions directly.
16079   if (Subtarget.hasSSE41())
16080     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16081 
16082   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
16083   SDValue Curr = In;
16084   MVT CurrVT = InVT;
16085 
16086   // As SRAI is only available on i16/i32 types, we expand only up to i32
16087   // and handle i64 separately.
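  // Each UNPCKL with undef places a source element in the high half of a
  // twice-as-wide element; the arithmetic shift right afterwards moves it
  // back down while sign-extending it.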
16088   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
16089     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
16090     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
16091     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
16092     Curr = DAG.getBitcast(CurrVT, Curr);
16093   }
16094 
16095   SDValue SignExt = Curr;
16096   if (CurrVT != InVT) {
16097     unsigned SignExtShift =
16098         CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
16099     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16100                           DAG.getConstant(SignExtShift, dl, MVT::i8));
16101   }
16102 
16103   if (CurrVT == VT)
16104     return SignExt;
16105 
16106   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
16107     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16108                                DAG.getConstant(31, dl, MVT::i8));
16109     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
16110     return DAG.getBitcast(VT, Ext);
16111   }
16112 
16113   return SDValue();
16114 }
16115 
16116 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16117                                 SelectionDAG &DAG) {
16118   MVT VT = Op->getSimpleValueType(0);
16119   SDValue In = Op->getOperand(0);
16120   MVT InVT = In.getSimpleValueType();
16121   SDLoc dl(Op);
16122 
16123   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16124     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16125 
16126   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16127       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16128       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16129     return SDValue();
16130 
16131   if (Subtarget.hasInt256())
16132     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16133 
16134   // Optimize vectors in AVX mode:
16135   // Sign extend  v8i16 to v8i32 and
16136   //              v4i32 to v4i64.
16137   //
16138   // Divide the input vector into two parts
16139   // (for v4i32 the shuffle masks will be {0, 1, -1, -1} and {2, 3, -1, -1}),
16140   // use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
16141   // and concat the resulting vectors back to the original VT.
16142 
16143   unsigned NumElems = InVT.getVectorNumElements();
16144   SDValue Undef = DAG.getUNDEF(InVT);
16145 
16146   SmallVector<int,8> ShufMask1(NumElems, -1);
16147   for (unsigned i = 0; i != NumElems/2; ++i)
16148     ShufMask1[i] = i;
16149 
16150   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
16151 
16152   SmallVector<int,8> ShufMask2(NumElems, -1);
16153   for (unsigned i = 0; i != NumElems/2; ++i)
16154     ShufMask2[i] = i + NumElems/2;
16155 
16156   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
16157 
16158   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
16159                                 VT.getVectorNumElements()/2);
16160 
16161   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16162   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16163 
16164   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16165 }
16166 
16167 // Lower a truncating store. Truncating stores to vXi1 vectors need special lowering.
16168 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
16169                                     SelectionDAG &DAG) {
16170   StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
16171   SDLoc dl(St);
16172   EVT MemVT = St->getMemoryVT();
16173   assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
16174   assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
16175          "Expected truncstore of i1 vector");
16176 
16177   SDValue Op = St->getValue();
16178   MVT OpVT = Op.getValueType().getSimpleVT();
16179   unsigned NumElts = OpVT.getVectorNumElements();
16180   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16181       NumElts == 16) {
16182     // Truncate and store - everything is legal
16183     Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
16184     if (MemVT.getSizeInBits() < 8)
16185       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
16186                        DAG.getUNDEF(MVT::v8i1), Op,
16187                        DAG.getIntPtrConstant(0, dl));
16188     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16189                         St->getMemOperand());
16190   }
16191 
16192   // Only a subset of AVX-512 is available; assume we have just AVX-512F.
16193   if (NumElts <= 8) {
16194     if (NumElts < 8) {
16195       // Extend to an 8-element vector.
16196       MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
16197       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
16198                         DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
16199     }
16200     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
16201     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16202                         St->getMemOperand());
16203   }
16204   // v32i8
16205   assert(OpVT == MVT::v32i8 && "Unexpected operand type");
16206   // Divide the vector into 2 parts and store each part separately
16207   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16208                             DAG.getIntPtrConstant(0, dl));
16209   Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
16210   SDValue BasePtr = St->getBasePtr();
16211   SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
16212                               St->getMemOperand());
16213   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16214                             DAG.getIntPtrConstant(16, dl));
16215   Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
16216 
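  // The second half of the mask (16 bits) starts two bytes into the store.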
16217   SDValue BasePtrHi =
16218     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16219                 DAG.getConstant(2, dl, BasePtr.getValueType()));
16220 
16221   SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
16222                               BasePtrHi, St->getMemOperand());
16223   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
16224 }
16225 
16226 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
16227                                            const X86Subtarget &Subtarget,
16228                                            SelectionDAG &DAG) {
16229 
16230   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16231   SDLoc dl(Ld);
16232   EVT MemVT = Ld->getMemoryVT();
16233   assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
16234          "Expected i1 vector load");
16235   unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
16236     ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16237   MVT VT = Op.getValueType().getSimpleVT();
16238   unsigned NumElts = VT.getVectorNumElements();
16239 
16240   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16241       NumElts == 16) {
16242     // Load and extend - everything is legal
16243     if (NumElts < 8) {
16244       SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
16245                                  Ld->getBasePtr(),
16246                                  Ld->getMemOperand());
16247       // Replace chain users with the new chain.
16248       assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16249       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16250       MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16251       SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
16252 
16253       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16254                                    DAG.getIntPtrConstant(0, dl));
16255     }
16256     SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
16257                                Ld->getBasePtr(),
16258                                Ld->getMemOperand());
16259     // Replace chain users with the new chain.
16260     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16261     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16262 
16263     // Finally, do a normal sign-extend to the desired register.
16264     return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
16265   }
16266 
16267   if (NumElts <= 8) {
16268     // Only a subset of AVX-512 is available; assume we have just AVX-512F.
16269     unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
16270     MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
16271     SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
16272                               Ld->getBasePtr(),
16273                               Ld->getMemOperand());
16274     // Replace chain users with the new chain.
16275     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16276     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16277 
16278     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
16279     SDValue BitVec = DAG.getBitcast(MaskVT, Load);
16280 
16281     if (NumElts == 8)
16282       return DAG.getNode(ExtOpcode, dl, VT, BitVec);
16283 
16284     // Handle the remaining v4i1 and v2i1 cases here.
16285 
16286     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16287     SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
16288     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16289                         DAG.getIntPtrConstant(0, dl));
16290   }
16291 
16292   assert(VT == MVT::v32i8 && "Unexpected extload type");
16293 
16294   SmallVector<SDValue, 2> Chains;
16295 
16296   SDValue BasePtr = Ld->getBasePtr();
16297   SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16298                                Ld->getBasePtr(),
16299                                Ld->getMemOperand());
16300   Chains.push_back(LoadLo.getValue(1));
16301 
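  // The upper 16 mask bits start two bytes into the memory location.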
16302   SDValue BasePtrHi =
16303     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16304                 DAG.getConstant(2, dl, BasePtr.getValueType()));
16305 
16306   SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16307                                BasePtrHi,
16308                                Ld->getMemOperand());
16309   Chains.push_back(LoadHi.getValue(1));
16310   SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16311   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16312 
16313   SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
16314   SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
16315   return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
16316 }
16317 
16318 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16319 // may emit an illegal shuffle but the expansion is still better than scalar
16320 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16321 // we'll emit a shuffle and an arithmetic shift.
16322 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16323 // TODO: It is possible to support ZExt by zeroing the undef values during
16324 // the shuffle phase or after the shuffle.
16325 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
16326                                  SelectionDAG &DAG) {
16327   MVT RegVT = Op.getSimpleValueType();
16328   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16329   assert(RegVT.isInteger() &&
16330          "We only custom lower integer vector sext loads.");
16331 
16332   // Nothing useful we can do without SSE2 shuffles.
16333   assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
16334 
16335   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16336   SDLoc dl(Ld);
16337   EVT MemVT = Ld->getMemoryVT();
16338   if (MemVT.getScalarType() == MVT::i1)
16339     return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
16340 
16341   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16342   unsigned RegSz = RegVT.getSizeInBits();
16343 
16344   ISD::LoadExtType Ext = Ld->getExtensionType();
16345 
16346   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16347          && "Only anyext and sext are currently implemented.");
16348   assert(MemVT != RegVT && "Cannot extend to the same type");
16349   assert(MemVT.isVector() && "Must load a vector from memory");
16350 
16351   unsigned NumElems = RegVT.getVectorNumElements();
16352   unsigned MemSz = MemVT.getSizeInBits();
16353   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16354 
16355   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
16356     // The only way in which we have a legal 256-bit vector result but not the
16357     // integer 256-bit operations needed to directly lower a sextload is if we
16358     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16359     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16360     // correctly legalized. We do this late to allow the canonical form of
16361     // sextload to persist throughout the rest of the DAG combiner -- it wants
16362     // to fold together any extensions it can, and so will fuse a sign_extend
16363     // of an sextload into a sextload targeting a wider value.
16364     SDValue Load;
16365     if (MemSz == 128) {
16366       // Just switch this to a normal load.
16367       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16368                                        "it must be a legal 128-bit vector "
16369                                        "type!");
16370       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16371                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16372                   Ld->isInvariant(), Ld->getAlignment());
16373     } else {
16374       assert(MemSz < 128 &&
16375              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16376       // Do an sext load to a 128-bit vector type. We want to use the same
16377       // number of elements, but elements half as wide. This will end up being
16378       // recursively lowered by this routine, but will succeed as we definitely
16379       // have all the necessary features if we're using AVX1.
16380       EVT HalfEltVT =
16381           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16382       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16383       Load =
16384           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16385                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16386                          Ld->isNonTemporal(), Ld->isInvariant(),
16387                          Ld->getAlignment());
16388     }
16389 
16390     // Replace chain users with the new chain.
16391     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16392     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16393 
16394     // Finally, do a normal sign-extend to the desired register.
16395     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16396   }
16397 
16398   // All sizes must be a power of two.
16399   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16400          "Non-power-of-two elements are not custom lowered!");
16401 
16402   // Attempt to load the original value using scalar loads.
16403   // Find the largest scalar type that divides the total loaded size.
16404   MVT SclrLoadTy = MVT::i8;
16405   for (MVT Tp : MVT::integer_valuetypes()) {
16406     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16407       SclrLoadTy = Tp;
16408     }
16409   }
16410 
16411   // On 32-bit targets i64 is not legal, so we can't use 64-bit integer loads; try f64 instead.
16412   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16413       (64 <= MemSz))
16414     SclrLoadTy = MVT::f64;
16415 
16416   // Calculate the number of scalar loads that we need to perform
16417   // in order to load our vector from memory.
16418   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16419 
16420   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16421          "Can only lower sext loads with a single scalar load!");
16422 
16423   unsigned LoadRegSize = RegSz;
16424   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
16425     LoadRegSize = 128;
16426 
16427   // Represent our vector as a sequence of elements which are the
16428   // largest scalar that we can load.
16429   EVT LoadUnitVecVT = EVT::getVectorVT(
16430       *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());
16431 
16432   // Represent the data using the same element type that is stored in
16433   // memory. In practice, we "widen" MemVT.
16434   EVT WideVecVT =
16435       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16436                        LoadRegSize / MemVT.getScalarSizeInBits());
16437 
16438   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16439          "Invalid vector type");
16440 
16441   // We can't shuffle using an illegal type.
16442   assert(TLI.isTypeLegal(WideVecVT) &&
16443          "We only lower types that form legal widened vector types");
16444 
16445   SmallVector<SDValue, 8> Chains;
16446   SDValue Ptr = Ld->getBasePtr();
16447   SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
16448                                       TLI.getPointerTy(DAG.getDataLayout()));
16449   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16450 
16451   for (unsigned i = 0; i < NumLoads; ++i) {
16452     // Perform a single load.
16453     SDValue ScalarLoad =
16454         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16455                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16456                     Ld->getAlignment());
16457     Chains.push_back(ScalarLoad.getValue(1));
16458     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16459     // another round of DAGCombining.
16460     if (i == 0)
16461       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16462     else
16463       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16464                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
16465 
16466     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16467   }
16468 
16469   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16470 
16471   // Bitcast the loaded value to a vector of the original element type, in
16472   // the size of the target vector type.
16473   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
16474   unsigned SizeRatio = RegSz / MemSz;
16475 
16476   if (Ext == ISD::SEXTLOAD) {
16477     // If we have SSE4.1, we can directly emit a VSEXT node.
16478     if (Subtarget.hasSSE41()) {
16479       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16480       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16481       return Sext;
16482     }
16483 
16484     // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
16485     // lanes.
16486     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
16487            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
16488 
16489     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
16490     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16491     return Shuff;
16492   }
16493 
16494   // Redistribute the loaded elements into the different locations.
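  // E.g. for a 4 x i16 -> 4 x i32 any-extending load, SizeRatio is 2 and the
  // mask becomes <0,-1,1,-1,2,-1,3,-1>, leaving each loaded element in the low
  // half of a wide lane with undef bits above it.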
16495   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16496   for (unsigned i = 0; i != NumElems; ++i)
16497     ShuffleVec[i * SizeRatio] = i;
16498 
16499   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16500                                        DAG.getUNDEF(WideVecVT), ShuffleVec);
16501 
16502   // Bitcast to the requested type.
16503   Shuff = DAG.getBitcast(RegVT, Shuff);
16504   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16505   return Shuff;
16506 }
16507 
16508 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
16509 /// each of which has no other use apart from the AND / OR.
16510 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16511   Opc = Op.getOpcode();
16512   if (Opc != ISD::OR && Opc != ISD::AND)
16513     return false;
16514   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16515           Op.getOperand(0).hasOneUse() &&
16516           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16517           Op.getOperand(1).hasOneUse());
16518 }
16519 
16520 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and the constant 1,
16521 /// and the SETCC node has a single use.
16522 static bool isXor1OfSetCC(SDValue Op) {
16523   if (Op.getOpcode() != ISD::XOR)
16524     return false;
16525   if (isOneConstant(Op.getOperand(1)))
16526     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16527            Op.getOperand(0).hasOneUse();
16528   return false;
16529 }
16530 
16531 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16532   bool addTest = true;
16533   SDValue Chain = Op.getOperand(0);
16534   SDValue Cond  = Op.getOperand(1);
16535   SDValue Dest  = Op.getOperand(2);
16536   SDLoc dl(Op);
16537   SDValue CC;
16538   bool Inverted = false;
16539 
16540   if (Cond.getOpcode() == ISD::SETCC) {
16541     // Check for setcc([su]{add,sub,mul}o == 0).
16542     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16543         isNullConstant(Cond.getOperand(1)) &&
16544         Cond.getOperand(0).getResNo() == 1 &&
16545         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16546          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16547          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16548          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16549          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16550          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16551       Inverted = true;
16552       Cond = Cond.getOperand(0);
16553     } else {
16554       if (SDValue NewCond = LowerSETCC(Cond, DAG))
16555         Cond = NewCond;
16556     }
16557   }
16558 #if 0
16559   // FIXME: LowerXALUO doesn't handle these!!
16560   else if (Cond.getOpcode() == X86ISD::ADD  ||
16561            Cond.getOpcode() == X86ISD::SUB  ||
16562            Cond.getOpcode() == X86ISD::SMUL ||
16563            Cond.getOpcode() == X86ISD::UMUL)
16564     Cond = LowerXALUO(Cond, DAG);
16565 #endif
16566 
16567   // Look past (and (setcc_carry (cmp ...)), 1).
16568   if (Cond.getOpcode() == ISD::AND &&
16569       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
16570       isOneConstant(Cond.getOperand(1)))
16571     Cond = Cond.getOperand(0);
16572 
16573   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16574   // setting operand in place of the X86ISD::SETCC.
16575   unsigned CondOpcode = Cond.getOpcode();
16576   if (CondOpcode == X86ISD::SETCC ||
16577       CondOpcode == X86ISD::SETCC_CARRY) {
16578     CC = Cond.getOperand(0);
16579 
16580     SDValue Cmp = Cond.getOperand(1);
16581     unsigned Opc = Cmp.getOpcode();
16582     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16583     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16584       Cond = Cmp;
16585       addTest = false;
16586     } else {
16587       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16588       default: break;
16589       case X86::COND_O:
16590       case X86::COND_B:
16591         // These can only come from an arithmetic instruction with overflow,
16592         // e.g. SADDO, UADDO.
16593         Cond = Cond.getNode()->getOperand(1);
16594         addTest = false;
16595         break;
16596       }
16597     }
16598   }
16599   CondOpcode = Cond.getOpcode();
16600   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16601       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16602       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16603        Cond.getOperand(0).getValueType() != MVT::i8)) {
16604     SDValue LHS = Cond.getOperand(0);
16605     SDValue RHS = Cond.getOperand(1);
16606     unsigned X86Opcode;
16607     unsigned X86Cond;
16608     SDVTList VTs;
16609     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16610     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16611     // X86ISD::INC).
16612     switch (CondOpcode) {
16613     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16614     case ISD::SADDO:
16615       if (isOneConstant(RHS)) {
16616           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16617           break;
16618         }
16619       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16620     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16621     case ISD::SSUBO:
16622       if (isOneConstant(RHS)) {
16623           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16624           break;
16625         }
16626       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16627     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16628     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16629     default: llvm_unreachable("unexpected overflowing operator");
16630     }
16631     if (Inverted)
16632       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16633     if (CondOpcode == ISD::UMULO)
16634       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16635                           MVT::i32);
16636     else
16637       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16638 
16639     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16640 
16641     if (CondOpcode == ISD::UMULO)
16642       Cond = X86Op.getValue(2);
16643     else
16644       Cond = X86Op.getValue(1);
16645 
16646     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16647     addTest = false;
16648   } else {
16649     unsigned CondOpc;
16650     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16651       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16652       if (CondOpc == ISD::OR) {
16653         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16654         // two branches instead of an explicit OR instruction with a
16655         // separate test.
16656         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16657             isX86LogicalCmp(Cmp)) {
16658           CC = Cond.getOperand(0).getOperand(0);
16659           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16660                               Chain, Dest, CC, Cmp);
16661           CC = Cond.getOperand(1).getOperand(0);
16662           Cond = Cmp;
16663           addTest = false;
16664         }
16665       } else { // ISD::AND
16666         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16667         // two branches instead of an explicit AND instruction with a
16668         // separate test. However, we only do this if this block doesn't
16669         // have a fall-through edge, because this requires an explicit
16670         // jmp when the condition is false.
16671         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16672             isX86LogicalCmp(Cmp) &&
16673             Op.getNode()->hasOneUse()) {
16674           X86::CondCode CCode =
16675             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16676           CCode = X86::GetOppositeBranchCondition(CCode);
16677           CC = DAG.getConstant(CCode, dl, MVT::i8);
16678           SDNode *User = *Op.getNode()->use_begin();
16679           // Look for an unconditional branch following this conditional branch.
16680           // We need this because we need to reverse the successors in order
16681           // to implement FCMP_OEQ.
16682           if (User->getOpcode() == ISD::BR) {
16683             SDValue FalseBB = User->getOperand(1);
16684             SDNode *NewBR =
16685               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16686             assert(NewBR == User);
16687             (void)NewBR;
16688             Dest = FalseBB;
16689 
16690             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16691                                 Chain, Dest, CC, Cmp);
16692             X86::CondCode CCode =
16693               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16694             CCode = X86::GetOppositeBranchCondition(CCode);
16695             CC = DAG.getConstant(CCode, dl, MVT::i8);
16696             Cond = Cmp;
16697             addTest = false;
16698           }
16699         }
16700       }
16701     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16702     // Recognize the (xorb (setcc), 1) pattern. The xor inverts the condition.
16703     // It should be transformed by the DAG combiner except when the condition
16704     // is set by an arithmetic-with-overflow node.
16705       X86::CondCode CCode =
16706         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16707       CCode = X86::GetOppositeBranchCondition(CCode);
16708       CC = DAG.getConstant(CCode, dl, MVT::i8);
16709       Cond = Cond.getOperand(0).getOperand(1);
16710       addTest = false;
16711     } else if (Cond.getOpcode() == ISD::SETCC &&
16712                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16713       // For FCMP_OEQ, we can emit
16714       // two branches instead of an explicit AND instruction with a
16715       // separate test. However, we only do this if this block doesn't
16716       // have a fall-through edge, because this requires an explicit
16717       // jmp when the condition is false.
16718       if (Op.getNode()->hasOneUse()) {
16719         SDNode *User = *Op.getNode()->use_begin();
16720         // Look for an unconditional branch following this conditional branch.
16721         // We need this because we need to reverse the successors in order
16722         // to implement FCMP_OEQ.
16723         if (User->getOpcode() == ISD::BR) {
16724           SDValue FalseBB = User->getOperand(1);
16725           SDNode *NewBR =
16726             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16727           assert(NewBR == User);
16728           (void)NewBR;
16729           Dest = FalseBB;
16730 
16731           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16732                                     Cond.getOperand(0), Cond.getOperand(1));
16733           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16734           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16735           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16736                               Chain, Dest, CC, Cmp);
16737           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
16738           Cond = Cmp;
16739           addTest = false;
16740         }
16741       }
16742     } else if (Cond.getOpcode() == ISD::SETCC &&
16743                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16744       // For FCMP_UNE, we can emit
16745       // two branches instead of an explicit OR instruction with a
16746       // separate test. However, we only do this if this block doesn't
16747       // have a fall-through edge, because this requires an explicit
16748       // jmp when the condition is false.
16749       if (Op.getNode()->hasOneUse()) {
16750         SDNode *User = *Op.getNode()->use_begin();
16751         // Look for an unconditional branch following this conditional branch.
16752         // We need this because we need to reverse the successors in order
16753         // to implement FCMP_UNE.
16754         if (User->getOpcode() == ISD::BR) {
16755           SDValue FalseBB = User->getOperand(1);
16756           SDNode *NewBR =
16757             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16758           assert(NewBR == User);
16759           (void)NewBR;
16760 
16761           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16762                                     Cond.getOperand(0), Cond.getOperand(1));
16763           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16764           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16765           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16766                               Chain, Dest, CC, Cmp);
16767           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
16768           Cond = Cmp;
16769           addTest = false;
16770           Dest = FalseBB;
16771         }
16772       }
16773     }
16774   }
16775 
16776   if (addTest) {
16777     // Look past the truncate if the high bits are known zero.
16778     Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
16779 
16780     // We know the result of AND is compared against zero. Try to match
16781     // it to BT.
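    // Illustrative sketch of the BT pattern (names are hypothetical): an
    // expression such as (and %x, (shl 1, %bit)) compared against zero can be
    // matched to a BT instruction that tests bit %bit of %x and branches on CF.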
16782     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16783       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
16784         CC = NewSetCC.getOperand(0);
16785         Cond = NewSetCC.getOperand(1);
16786         addTest = false;
16787       }
16788     }
16789   }
16790 
16791   if (addTest) {
16792     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16793     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16794     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16795   }
16796   Cond = ConvertCmpIfNecessary(Cond, DAG);
16797   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16798                      Chain, Dest, CC, Cond);
16799 }
16800 
16801 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16802 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16803 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16804 // that the guard pages used by the OS virtual memory manager are allocated in
16805 // correct sequence.
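// A rough worked example (numbers are illustrative only): a request for
// 10000 bytes spans ceil(10000 / 4096) = 3 pages, so the helper must touch
// each of those pages in order rather than moving the stack pointer once;
// otherwise the OS guard page would be skipped and the access would fault.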
16806 SDValue
16807 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16808                                            SelectionDAG &DAG) const {
16809   MachineFunction &MF = DAG.getMachineFunction();
16810   bool SplitStack = MF.shouldSplitStack();
16811   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
16812                SplitStack;
16813   SDLoc dl(Op);
16814 
16815   // Get the inputs.
16816   SDNode *Node = Op.getNode();
16817   SDValue Chain = Op.getOperand(0);
16818   SDValue Size  = Op.getOperand(1);
16819   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16820   EVT VT = Node->getValueType(0);
16821 
16822   // Chain the dynamic stack allocation so that it doesn't modify the stack
16823   // pointer when other instructions are using the stack.
16824   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
16825 
16826   bool Is64Bit = Subtarget.is64Bit();
16827   MVT SPTy = getPointerTy(DAG.getDataLayout());
16828 
16829   SDValue Result;
16830   if (!Lower) {
16831     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16832     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16833     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16834                     " not tell us which reg is the stack pointer!");
16835 
16836     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16837     Chain = SP.getValue(1);
16838     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
16839     unsigned StackAlign = TFI.getStackAlignment();
16840     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
16841     if (Align > StackAlign)
16842       Result = DAG.getNode(ISD::AND, dl, VT, Result,
16843                          DAG.getConstant(-(uint64_t)Align, dl, VT));
16844     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
16845   } else if (SplitStack) {
16846     MachineRegisterInfo &MRI = MF.getRegInfo();
16847 
16848     if (Is64Bit) {
16849       // The 64-bit implementation of segmented stacks needs to clobber both r10
16850       // and r11. This makes it impossible to use it along with nested parameters.
16851       const Function *F = MF.getFunction();
16852       for (const auto &A : F->args()) {
16853         if (A.hasNestAttr())
16854           report_fatal_error("Cannot use segmented stacks with functions that "
16855                              "have nested arguments.");
16856       }
16857     }
16858 
16859     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
16860     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16861     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16862     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16863                                 DAG.getRegister(Vreg, SPTy));
16864   } else {
16865     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16866     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
16867     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
16868 
16869     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
16870     unsigned SPReg = RegInfo->getStackRegister();
16871     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16872     Chain = SP.getValue(1);
16873 
16874     if (Align) {
16875       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16876                        DAG.getConstant(-(uint64_t)Align, dl, VT));
16877       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16878     }
16879 
16880     Result = SP;
16881   }
16882 
16883   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
16884                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
16885 
16886   SDValue Ops[2] = {Result, Chain};
16887   return DAG.getMergeValues(Ops, dl);
16888 }
16889 
16890 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16891   MachineFunction &MF = DAG.getMachineFunction();
16892   auto PtrVT = getPointerTy(MF.getDataLayout());
16893   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16894 
16895   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16896   SDLoc DL(Op);
16897 
16898   if (!Subtarget.is64Bit() ||
16899       Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
16900     // vastart just stores the address of the VarArgsFrameIndex slot into the
16901     // memory location argument.
16902     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16903     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16904                         MachinePointerInfo(SV), false, false, 0);
16905   }
16906 
16907   // __va_list_tag:
16908   //   gp_offset         (0 - 6 * 8)
16909   //   fp_offset         (48 - 48 + 8 * 16)
16910   //   overflow_arg_area (point to parameters coming in memory).
16911   //   reg_save_area
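  // For reference, a sketch of the corresponding SysV AMD64 ABI C definition
  // (LP64 offsets shown; on x32 the pointer fields are 4 bytes wide):
  //   typedef struct {
  //     unsigned int gp_offset;    // offset 0
  //     unsigned int fp_offset;    // offset 4
  //     void *overflow_arg_area;   // offset 8
  //     void *reg_save_area;       // offset 16
  //   } __va_list_tag;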
16912   SmallVector<SDValue, 8> MemOps;
16913   SDValue FIN = Op.getOperand(1);
16914   // Store gp_offset
16915   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16916                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16917                                                DL, MVT::i32),
16918                                FIN, MachinePointerInfo(SV), false, false, 0);
16919   MemOps.push_back(Store);
16920 
16921   // Store fp_offset
16922   FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
16923   Store = DAG.getStore(Op.getOperand(0), DL,
16924                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
16925                                        MVT::i32),
16926                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16927   MemOps.push_back(Store);
16928 
16929   // Store ptr to overflow_arg_area
16930   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
16931   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16932   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16933                        MachinePointerInfo(SV, 8),
16934                        false, false, 0);
16935   MemOps.push_back(Store);
16936 
16937   // Store ptr to reg_save_area.
16938   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
16939       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
16940   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
16941   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
16942       SV, Subtarget.isTarget64BitLP64() ? 16 : 12), false, false, 0);
16943   MemOps.push_back(Store);
16944   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16945 }
16946 
16947 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16948   assert(Subtarget.is64Bit() &&
16949          "LowerVAARG only handles 64-bit va_arg!");
16950   assert(Op.getNode()->getNumOperands() == 4);
16951 
16952   MachineFunction &MF = DAG.getMachineFunction();
16953   if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
16954     // The Win64 ABI uses char* instead of a structure.
16955     return DAG.expandVAArg(Op.getNode());
16956 
16957   SDValue Chain = Op.getOperand(0);
16958   SDValue SrcPtr = Op.getOperand(1);
16959   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16960   unsigned Align = Op.getConstantOperandVal(3);
16961   SDLoc dl(Op);
16962 
16963   EVT ArgVT = Op.getNode()->getValueType(0);
16964   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16965   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
16966   uint8_t ArgMode;
16967 
16968   // Decide which area this value should be read from.
16969   // TODO: Implement the AMD64 ABI in its entirety. This simple
16970   // selection mechanism works only for the basic types.
16971   if (ArgVT == MVT::f80) {
16972     llvm_unreachable("va_arg for f80 not yet implemented");
16973   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16974     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16975   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16976     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16977   } else {
16978     llvm_unreachable("Unhandled argument type in LowerVAARG");
16979   }
16980 
16981   if (ArgMode == 2) {
16982     // Sanity Check: Make sure using fp_offset makes sense.
16983     assert(!Subtarget.useSoftFloat() &&
16984            !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
16985            Subtarget.hasSSE1());
16986   }
16987 
16988   // Insert VAARG_64 node into the DAG
16989   // VAARG_64 returns two values: Variable Argument Address, Chain
16990   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
16991                        DAG.getConstant(ArgMode, dl, MVT::i8),
16992                        DAG.getConstant(Align, dl, MVT::i32)};
16993   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
16994   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16995                                           VTs, InstOps, MVT::i64,
16996                                           MachinePointerInfo(SV),
16997                                           /*Align=*/0,
16998                                           /*Volatile=*/false,
16999                                           /*ReadMem=*/true,
17000                                           /*WriteMem=*/true);
17001   Chain = VAARG.getValue(1);
17002 
17003   // Load the next argument and return it
17004   return DAG.getLoad(ArgVT, dl,
17005                      Chain,
17006                      VAARG,
17007                      MachinePointerInfo(),
17008                      false, false, false, 0);
17009 }
17010 
17011 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
17012                            SelectionDAG &DAG) {
17013   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
17014   // where a va_list is still an i8*.
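  // (Size sketch used below: 4 + 4 + 8 + 8 = 24 bytes, copied by the memcpy
  // at the end of this function with 8-byte alignment.)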
17015   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
17016   if (Subtarget.isCallingConvWin64(
17017         DAG.getMachineFunction().getFunction()->getCallingConv()))
17018     // Probably a Win64 va_copy.
17019     return DAG.expandVACopy(Op.getNode());
17020 
17021   SDValue Chain = Op.getOperand(0);
17022   SDValue DstPtr = Op.getOperand(1);
17023   SDValue SrcPtr = Op.getOperand(2);
17024   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17025   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17026   SDLoc DL(Op);
17027 
17028   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17029                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
17030                        false, false,
17031                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17032 }
17033 
17034 /// Handle vector element shifts where the shift amount is a constant.
17035 /// Takes immediate version of shift as input.
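/// Illustration with made-up values: a constant build_vector folds directly,
/// e.g. for VSHLI with ShiftAmt = 2,
///   (v4i32 build_vector 1, 2, undef, 4) --> (v4i32 build_vector 4, 8, undef, 16)
/// while a shift amount >= the element width yields all zeros (or, for VSRAI,
/// is clamped to the element width minus one to preserve the sign fill).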
17036 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
17037                                           SDValue SrcOp, uint64_t ShiftAmt,
17038                                           SelectionDAG &DAG) {
17039   MVT ElementType = VT.getVectorElementType();
17040 
17041   // Fold this packed shift into its first operand if ShiftAmt is 0.
17042   if (ShiftAmt == 0)
17043     return SrcOp;
17044 
17045   // Check for ShiftAmt >= element width
17046   if (ShiftAmt >= ElementType.getSizeInBits()) {
17047     if (Opc == X86ISD::VSRAI)
17048       ShiftAmt = ElementType.getSizeInBits() - 1;
17049     else
17050       return DAG.getConstant(0, dl, VT);
17051   }
17052 
17053   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17054          && "Unknown target vector shift-by-constant node");
17055 
17056   // Fold this packed vector shift into a build vector if SrcOp is a
17057   // vector of Constants or UNDEFs, and SrcOp's value type is the same as VT.
17058   if (VT == SrcOp.getSimpleValueType() &&
17059       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17060     SmallVector<SDValue, 8> Elts;
17061     unsigned NumElts = SrcOp->getNumOperands();
17062     ConstantSDNode *ND;
17063 
17064     switch(Opc) {
17065     default: llvm_unreachable("Unknown opcode!");
17066     case X86ISD::VSHLI:
17067       for (unsigned i=0; i!=NumElts; ++i) {
17068         SDValue CurrentOp = SrcOp->getOperand(i);
17069         if (CurrentOp->isUndef()) {
17070           Elts.push_back(CurrentOp);
17071           continue;
17072         }
17073         ND = cast<ConstantSDNode>(CurrentOp);
17074         const APInt &C = ND->getAPIntValue();
17075         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
17076       }
17077       break;
17078     case X86ISD::VSRLI:
17079       for (unsigned i=0; i!=NumElts; ++i) {
17080         SDValue CurrentOp = SrcOp->getOperand(i);
17081         if (CurrentOp->isUndef()) {
17082           Elts.push_back(CurrentOp);
17083           continue;
17084         }
17085         ND = cast<ConstantSDNode>(CurrentOp);
17086         const APInt &C = ND->getAPIntValue();
17087         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
17088       }
17089       break;
17090     case X86ISD::VSRAI:
17091       for (unsigned i=0; i!=NumElts; ++i) {
17092         SDValue CurrentOp = SrcOp->getOperand(i);
17093         if (CurrentOp->isUndef()) {
17094           Elts.push_back(CurrentOp);
17095           continue;
17096         }
17097         ND = cast<ConstantSDNode>(CurrentOp);
17098         const APInt &C = ND->getAPIntValue();
17099         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
17100       }
17101       break;
17102     }
17103 
17104     return DAG.getBuildVector(VT, dl, Elts);
17105   }
17106 
17107   return DAG.getNode(Opc, dl, VT, SrcOp,
17108                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
17109 }
17110 
17111 /// Handle vector element shifts where the shift amount may or may not be a
17112 /// constant. Takes immediate version of shift as input.
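/// Illustration (hypothetical operands): a non-constant i32 amount %amt is
/// placed in the low lane of a 128-bit vector, since SSE/AVX packed shifts
/// read only the lower 64 bits of the count operand, e.g.
///   (VSHLI v8i16:%src, i32:%amt)
///     --> (VSHL v8i16:%src,
///            (bitcast to v8i16 (v4i32 build_vector %amt, 0, undef, undef)))
/// A constant amount is handled by getTargetVShiftByConstNode above instead.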
17113 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
17114                                    SDValue SrcOp, SDValue ShAmt,
17115                                    SelectionDAG &DAG) {
17116   MVT SVT = ShAmt.getSimpleValueType();
17117   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17118 
17119   // Catch shift-by-constant.
17120   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17121     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17122                                       CShAmt->getZExtValue(), DAG);
17123 
17124   // Change opcode to non-immediate version
17125   switch (Opc) {
17126     default: llvm_unreachable("Unknown target vector shift node");
17127     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17128     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17129     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17130   }
17131 
17132   const X86Subtarget &Subtarget =
17133       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17134   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17135       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17136     // Let the shuffle legalizer expand this shift amount node.
17137     SDValue Op0 = ShAmt.getOperand(0);
17138     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17139     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
17140   } else {
17141     // Need to build a vector containing the shift amount.
17142     // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
17143     SmallVector<SDValue, 4> ShOps;
17144     ShOps.push_back(ShAmt);
17145     if (SVT == MVT::i32) {
17146       ShOps.push_back(DAG.getConstant(0, dl, SVT));
17147       ShOps.push_back(DAG.getUNDEF(SVT));
17148     }
17149     ShOps.push_back(DAG.getUNDEF(SVT));
17150 
17151     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17152     ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
17153   }
17154 
17155   // The return type has to be a 128-bit type with the same element
17156   // type as the input type.
17157   MVT EltVT = VT.getVectorElementType();
17158   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17159 
17160   ShAmt = DAG.getBitcast(ShVT, ShAmt);
17161   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17162 }
17163 
17164 /// \brief Return Mask with the necessary casting or extending
17165 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
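/// Example (an assumption-level sketch, not taken from a test): an i8 mask
/// paired with MaskVT == v4i1 is bitcast to v8i1 and its low four elements are
/// taken with EXTRACT_SUBVECTOR, while an all-ones or all-zeros constant mask
/// folds directly to a target constant.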
17166 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
17167                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
17168                            const SDLoc &dl) {
17169 
17170   if (isAllOnesConstant(Mask))
17171     return DAG.getTargetConstant(1, dl, MaskVT);
17172   if (X86::isZeroNode(Mask))
17173     return DAG.getTargetConstant(0, dl, MaskVT);
17174 
17175   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
17176     // Mask should be extended
17177     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
17178                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
17179   }
17180 
17181   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
17182     if (MaskVT == MVT::v64i1) {
17183       assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
17184       // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
17185       SDValue Lo, Hi;
17186       Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17187                           DAG.getConstant(0, dl, MVT::i32));
17188       Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17189                           DAG.getConstant(1, dl, MVT::i32));
17190 
17191       Lo = DAG.getBitcast(MVT::v32i1, Lo);
17192       Hi = DAG.getBitcast(MVT::v32i1, Hi);
17193 
17194       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
17195     } else {
17196       // MaskVT requires fewer than 64 bits. Truncate the mask (this should
17197       // always succeed) and bitcast.
17198       MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
17199       return DAG.getBitcast(MaskVT,
17200                             DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
17201     }
17202 
17203   } else {
17204     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17205                                      Mask.getSimpleValueType().getSizeInBits());
17206     // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
17207     // are extracted by EXTRACT_SUBVECTOR.
17208     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17209                        DAG.getBitcast(BitcastVT, Mask),
17210                        DAG.getIntPtrConstant(0, dl));
17211   }
17212 }
17213 
17214 /// \brief Return (and \p Op, \p Mask) for compare instructions or
17215 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17216 /// necessary casting or extending for \p Mask when lowering masking intrinsics
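/// Hypothetical example of the non-compare path: a masked v8i32 operation %op
/// with i8 mask %k and pass-through %src becomes
///   (vselect (v8i1 mask derived from %k), %op, %src)
/// and an all-ones mask simply returns the unmasked operation unchanged.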
17217 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17218                   SDValue PreservedSrc,
17219                   const X86Subtarget &Subtarget,
17220                   SelectionDAG &DAG) {
17221   MVT VT = Op.getSimpleValueType();
17222   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17223   unsigned OpcodeSelect = ISD::VSELECT;
17224   SDLoc dl(Op);
17225 
17226   if (isAllOnesConstant(Mask))
17227     return Op;
17228 
17229   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17230 
17231   switch (Op.getOpcode()) {
17232   default: break;
17233   case X86ISD::PCMPEQM:
17234   case X86ISD::PCMPGTM:
17235   case X86ISD::CMPM:
17236   case X86ISD::CMPMU:
17237     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17238   case X86ISD::VFPCLASS:
17239   case X86ISD::VFPCLASSS:
17240     return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
17241   case X86ISD::VTRUNC:
17242   case X86ISD::VTRUNCS:
17243   case X86ISD::VTRUNCUS:
17244   case ISD::FP_TO_FP16:
17245     // We can't use ISD::VSELECT here because it is not always "Legal"
17246     // for the destination type. For example, vpmovqb requires only AVX512,
17247     // while a vselect that operates on byte elements requires BWI.
17248     OpcodeSelect = X86ISD::SELECT;
17249     break;
17250   }
17251   if (PreservedSrc.isUndef())
17252     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17253   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
17254 }
17255 
17256 /// \brief Creates an SDNode for a predicated scalar operation.
17257 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
17258 /// The mask is coming as MVT::i8 and it should be truncated
17259 /// to MVT::i1 while lowering masking intrinsics.
17260 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17261 /// "X86select" instead of "vselect". We just can't create the "vselect" node
17262 /// for a scalar instruction.
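/// Sketch with hypothetical operands: a masked scalar operation %op with i8
/// mask %k and pass-through %src becomes
///   (X86select (trunc %k to i1), %op, %src)
/// while FSETCC and VFPCLASS/VFPCLASSS results are instead combined with the
/// truncated mask via AND or OR respectively.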
17263 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17264                                     SDValue PreservedSrc,
17265                                     const X86Subtarget &Subtarget,
17266                                     SelectionDAG &DAG) {
17267   if (isAllOnesConstant(Mask))
17268     return Op;
17269 
17270   MVT VT = Op.getSimpleValueType();
17271   SDLoc dl(Op);
17272   // The mask should be of type MVT::i1
17273   SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17274 
17275   if (Op.getOpcode() == X86ISD::FSETCC)
17276     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
17277   if (Op.getOpcode() == X86ISD::VFPCLASS ||
17278       Op.getOpcode() == X86ISD::VFPCLASSS)
17279     return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
17280 
17281   if (PreservedSrc.isUndef())
17282     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17283   return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17284 }
17285 
17286 static int getSEHRegistrationNodeSize(const Function *Fn) {
17287   if (!Fn->hasPersonalityFn())
17288     report_fatal_error(
17289         "querying registration node size for function without personality");
17290   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
17291   // WinEHStatePass for the full struct definition.
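  // (That is 6 * 4 = 24 bytes and 4 * 4 = 16 bytes, matching the values
  // returned below.)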
17292   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
17293   case EHPersonality::MSVC_X86SEH: return 24;
17294   case EHPersonality::MSVC_CXX: return 16;
17295   default: break;
17296   }
17297   report_fatal_error(
17298       "can only recover FP for 32-bit MSVC EH personality functions");
17299 }
17300 
17301 /// When the MSVC runtime transfers control to us, either to an outlined
17302 /// function or when returning to a parent frame after catching an exception, we
17303 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
17304 /// Here's the math:
17305 ///   RegNodeBase = EntryEBP - RegNodeSize
17306 ///   ParentFP = RegNodeBase - ParentFrameOffset
17307 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
17308 /// subtracting the offset (negative on x86) takes us back to the parent FP.
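/// Worked example with made-up numbers: for the MSVC C++ EH personality the
/// registration node is 16 bytes, so with EntryEBP = 0x1000 and
/// ParentFrameOffset = -0x20:
///   RegNodeBase = 0x1000 - 16     = 0xff0
///   ParentFP    = 0xff0 - (-0x20) = 0x1010
/// On x64 the code below instead returns EntryEBP + ParentFrameOffset directly.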
17309 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
17310                                    SDValue EntryEBP) {
17311   MachineFunction &MF = DAG.getMachineFunction();
17312   SDLoc dl;
17313 
17314   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17315   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
17316 
17317   // It's possible that the parent function no longer has a personality function
17318   // if the exceptional code was optimized away, in which case we just return
17319   // the incoming EBP.
17320   if (!Fn->hasPersonalityFn())
17321     return EntryEBP;
17322 
17323   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
17324   // registration, or the .set_setframe offset.
17325   MCSymbol *OffsetSym =
17326       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
17327           GlobalValue::getRealLinkageName(Fn->getName()));
17328   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
17329   SDValue ParentFrameOffset =
17330       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
17331 
17332   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
17333   // prologue to RBP in the parent function.
17334   const X86Subtarget &Subtarget =
17335       static_cast<const X86Subtarget &>(DAG.getSubtarget());
17336   if (Subtarget.is64Bit())
17337     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
17338 
17339   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
17340   // RegNodeBase = EntryEBP - RegNodeSize
17341   // ParentFP = RegNodeBase - ParentFrameOffset
17342   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
17343                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
17344   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
17345 }
17346 
17347 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
17348                                        SelectionDAG &DAG) {
17349   SDLoc dl(Op);
17350   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17351   MVT VT = Op.getSimpleValueType();
17352   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17353   if (IntrData) {
17354     switch(IntrData->Type) {
17355     case INTR_TYPE_1OP:
17356       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17357     case INTR_TYPE_2OP:
17358       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17359         Op.getOperand(2));
17360     case INTR_TYPE_2OP_IMM8:
17361       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17362                          DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
17363     case INTR_TYPE_3OP:
17364       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17365         Op.getOperand(2), Op.getOperand(3));
17366     case INTR_TYPE_4OP:
17367       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17368         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
17369     case INTR_TYPE_1OP_MASK_RM: {
17370       SDValue Src = Op.getOperand(1);
17371       SDValue PassThru = Op.getOperand(2);
17372       SDValue Mask = Op.getOperand(3);
17373       SDValue RoundingMode;
17374       // We always add a rounding mode to the node.
17375       // If the rounding mode is not specified, we add the
17376       // "current direction" mode.
17377       if (Op.getNumOperands() == 4)
17378         RoundingMode =
17379           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17380       else
17381         RoundingMode = Op.getOperand(4);
17382       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17383       if (IntrWithRoundingModeOpcode != 0)
17384         if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
17385             X86::STATIC_ROUNDING::CUR_DIRECTION)
17386           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17387                                       dl, Op.getValueType(), Src, RoundingMode),
17388                                       Mask, PassThru, Subtarget, DAG);
17389       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17390                                               RoundingMode),
17391                                   Mask, PassThru, Subtarget, DAG);
17392     }
17393     case INTR_TYPE_1OP_MASK: {
17394       SDValue Src = Op.getOperand(1);
17395       SDValue PassThru = Op.getOperand(2);
17396       SDValue Mask = Op.getOperand(3);
17397       // We add rounding mode to the Node when
17398       //   - RM Opcode is specified and
17399       //   - RM is not "current direction".
17400       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17401       if (IntrWithRoundingModeOpcode != 0) {
17402         SDValue Rnd = Op.getOperand(4);
17403         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17404         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17405           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17406                                       dl, Op.getValueType(),
17407                                       Src, Rnd),
17408                                       Mask, PassThru, Subtarget, DAG);
17409         }
17410       }
17411       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17412                                   Mask, PassThru, Subtarget, DAG);
17413     }
17414     case INTR_TYPE_SCALAR_MASK: {
17415       SDValue Src1 = Op.getOperand(1);
17416       SDValue Src2 = Op.getOperand(2);
17417       SDValue passThru = Op.getOperand(3);
17418       SDValue Mask = Op.getOperand(4);
17419       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
17420                                   Mask, passThru, Subtarget, DAG);
17421     }
17422     case INTR_TYPE_SCALAR_MASK_RM: {
17423       SDValue Src1 = Op.getOperand(1);
17424       SDValue Src2 = Op.getOperand(2);
17425       SDValue Src0 = Op.getOperand(3);
17426       SDValue Mask = Op.getOperand(4);
17427       // There are 2 kinds of intrinsics in this group:
17428       // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
17429       // (2) With rounding mode and sae - 7 operands.
17430       if (Op.getNumOperands() == 6) {
17431         SDValue Sae  = Op.getOperand(5);
17432         unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
17433         return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
17434                                                 Sae),
17435                                     Mask, Src0, Subtarget, DAG);
17436       }
17437       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
17438       SDValue RoundingMode  = Op.getOperand(5);
17439       SDValue Sae  = Op.getOperand(6);
17440       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17441                                               RoundingMode, Sae),
17442                                   Mask, Src0, Subtarget, DAG);
17443     }
17444     case INTR_TYPE_2OP_MASK:
17445     case INTR_TYPE_2OP_IMM8_MASK: {
17446       SDValue Src1 = Op.getOperand(1);
17447       SDValue Src2 = Op.getOperand(2);
17448       SDValue PassThru = Op.getOperand(3);
17449       SDValue Mask = Op.getOperand(4);
17450 
17451       if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
17452         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
17453 
17454       // We specify 2 possible opcodes for intrinsics with rounding modes.
17455       // First, we check if the intrinsic may have non-default rounding mode,
17456       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17457       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17458       if (IntrWithRoundingModeOpcode != 0) {
17459         SDValue Rnd = Op.getOperand(5);
17460         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17461         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17462           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17463                                       dl, Op.getValueType(),
17464                                       Src1, Src2, Rnd),
17465                                       Mask, PassThru, Subtarget, DAG);
17466         }
17467       }
17468       // TODO: Intrinsics should have fast-math-flags to propagate.
17469       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
17470                                   Mask, PassThru, Subtarget, DAG);
17471     }
17472     case INTR_TYPE_2OP_MASK_RM: {
17473       SDValue Src1 = Op.getOperand(1);
17474       SDValue Src2 = Op.getOperand(2);
17475       SDValue PassThru = Op.getOperand(3);
17476       SDValue Mask = Op.getOperand(4);
17477       // We specify 2 possible modes for intrinsics, with/without rounding
17478       // modes.
17479       // First, we check whether the intrinsic has a rounding mode (6 operands);
17480       // if not, we set the rounding mode to "current".
17481       SDValue Rnd;
17482       if (Op.getNumOperands() == 6)
17483         Rnd = Op.getOperand(5);
17484       else
17485         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17486       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17487                                               Src1, Src2, Rnd),
17488                                   Mask, PassThru, Subtarget, DAG);
17489     }
17490     case INTR_TYPE_3OP_SCALAR_MASK_RM: {
17491       SDValue Src1 = Op.getOperand(1);
17492       SDValue Src2 = Op.getOperand(2);
17493       SDValue Src3 = Op.getOperand(3);
17494       SDValue PassThru = Op.getOperand(4);
17495       SDValue Mask = Op.getOperand(5);
17496       SDValue Sae  = Op.getOperand(6);
17497 
17498       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
17499                                               Src2, Src3, Sae),
17500                                   Mask, PassThru, Subtarget, DAG);
17501     }
17502     case INTR_TYPE_3OP_MASK_RM: {
17503       SDValue Src1 = Op.getOperand(1);
17504       SDValue Src2 = Op.getOperand(2);
17505       SDValue Imm = Op.getOperand(3);
17506       SDValue PassThru = Op.getOperand(4);
17507       SDValue Mask = Op.getOperand(5);
17508       // We specify 2 possible modes for intrinsics, with/without rounding
17509       // modes.
17510       // First, we check whether the intrinsic has a rounding mode (7 operands);
17511       // if not, we set the rounding mode to "current".
17512       SDValue Rnd;
17513       if (Op.getNumOperands() == 7)
17514         Rnd = Op.getOperand(6);
17515       else
17516         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17517       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17518         Src1, Src2, Imm, Rnd),
17519         Mask, PassThru, Subtarget, DAG);
17520     }
17521     case INTR_TYPE_3OP_IMM8_MASK:
17522     case INTR_TYPE_3OP_MASK:
17523     case INSERT_SUBVEC: {
17524       SDValue Src1 = Op.getOperand(1);
17525       SDValue Src2 = Op.getOperand(2);
17526       SDValue Src3 = Op.getOperand(3);
17527       SDValue PassThru = Op.getOperand(4);
17528       SDValue Mask = Op.getOperand(5);
17529 
17530       if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
17531         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
17532       else if (IntrData->Type == INSERT_SUBVEC) {
17533         // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
17534         assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
17535         unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
17536         Imm *= Src2.getSimpleValueType().getVectorNumElements();
17537         Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
17538       }
17539 
17540       // We specify 2 possible opcodes for intrinsics with rounding modes.
17541       // First, we check if the intrinsic may have non-default rounding mode,
17542       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17543       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17544       if (IntrWithRoundingModeOpcode != 0) {
17545         SDValue Rnd = Op.getOperand(6);
17546         unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17547         if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17548           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17549                                       dl, Op.getValueType(),
17550                                       Src1, Src2, Src3, Rnd),
17551                                       Mask, PassThru, Subtarget, DAG);
17552         }
17553       }
17554       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17555                                               Src1, Src2, Src3),
17556                                   Mask, PassThru, Subtarget, DAG);
17557     }
17558     case VPERM_2OP_MASK : {
17559       SDValue Src1 = Op.getOperand(1);
17560       SDValue Src2 = Op.getOperand(2);
17561       SDValue PassThru = Op.getOperand(3);
17562       SDValue Mask = Op.getOperand(4);
17563 
17564       // Swap Src1 and Src2 in the node creation
17565       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
17566                                   Mask, PassThru, Subtarget, DAG);
17567     }
17568     case VPERM_3OP_MASKZ:
17569     case VPERM_3OP_MASK:{
17570       // Src2 is the PassThru
17571       SDValue Src1 = Op.getOperand(1);
17572       SDValue Src2 = Op.getOperand(2);
17573       SDValue Src3 = Op.getOperand(3);
17574       SDValue Mask = Op.getOperand(4);
17575       MVT VT = Op.getSimpleValueType();
17576       SDValue PassThru = SDValue();
17577 
17578       // set PassThru element
17579       if (IntrData->Type == VPERM_3OP_MASKZ)
17580         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17581       else
17582         PassThru = DAG.getBitcast(VT, Src2);
17583 
17584       // Swap Src1 and Src2 in the node creation
17585       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17586                                               dl, Op.getValueType(),
17587                                               Src2, Src1, Src3),
17588                                   Mask, PassThru, Subtarget, DAG);
17589     }
17590     case FMA_OP_MASK3:
17591     case FMA_OP_MASKZ:
17592     case FMA_OP_MASK: {
17593       SDValue Src1 = Op.getOperand(1);
17594       SDValue Src2 = Op.getOperand(2);
17595       SDValue Src3 = Op.getOperand(3);
17596       SDValue Mask = Op.getOperand(4);
17597       MVT VT = Op.getSimpleValueType();
17598       SDValue PassThru = SDValue();
17599 
17600       // set PassThru element
17601       if (IntrData->Type == FMA_OP_MASKZ)
17602         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17603       else if (IntrData->Type == FMA_OP_MASK3)
17604         PassThru = Src3;
17605       else
17606         PassThru = Src1;
17607 
17608       // We specify 2 possible opcodes for intrinsics with rounding modes.
17609       // First, we check if the intrinsic may have non-default rounding mode,
17610       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17611       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17612       if (IntrWithRoundingModeOpcode != 0) {
17613         SDValue Rnd = Op.getOperand(5);
17614         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17615             X86::STATIC_ROUNDING::CUR_DIRECTION)
17616           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17617                                                   dl, Op.getValueType(),
17618                                                   Src1, Src2, Src3, Rnd),
17619                                       Mask, PassThru, Subtarget, DAG);
17620       }
17621       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17622                                               dl, Op.getValueType(),
17623                                               Src1, Src2, Src3),
17624                                   Mask, PassThru, Subtarget, DAG);
17625     }
17626     case FMA_OP_SCALAR_MASK:
17627     case FMA_OP_SCALAR_MASK3:
17628     case FMA_OP_SCALAR_MASKZ: {
17629       SDValue Src1 = Op.getOperand(1);
17630       SDValue Src2 = Op.getOperand(2);
17631       SDValue Src3 = Op.getOperand(3);
17632       SDValue Mask = Op.getOperand(4);
17633       MVT VT = Op.getSimpleValueType();
17634       SDValue PassThru = SDValue();
17635 
17636       // set PassThru element
17637       if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
17638         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17639       else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
17640         PassThru = Src3;
17641       else
17642         PassThru = Src1;
17643 
17644       SDValue Rnd = Op.getOperand(5);
17645       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
17646                                               Op.getValueType(), Src1, Src2,
17647                                               Src3, Rnd),
17648                                   Mask, PassThru, Subtarget, DAG);
17649     }
17650     case TERLOG_OP_MASK:
17651     case TERLOG_OP_MASKZ: {
17652       SDValue Src1 = Op.getOperand(1);
17653       SDValue Src2 = Op.getOperand(2);
17654       SDValue Src3 = Op.getOperand(3);
17655       SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
17656       SDValue Mask = Op.getOperand(5);
17657       MVT VT = Op.getSimpleValueType();
17658       SDValue PassThru = Src1;
17659       // Set PassThru element.
17660       if (IntrData->Type == TERLOG_OP_MASKZ)
17661         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17662 
17663       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17664                                               Src1, Src2, Src3, Src4),
17665                                   Mask, PassThru, Subtarget, DAG);
17666     }
17667     case FPCLASS: {
17668       // FPclass intrinsics with mask
17669        SDValue Src1 = Op.getOperand(1);
17670        MVT VT = Src1.getSimpleValueType();
17671        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17672        SDValue Imm = Op.getOperand(2);
17673        SDValue Mask = Op.getOperand(3);
17674        MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17675                                      Mask.getSimpleValueType().getSizeInBits());
17676        SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
17677        SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
17678                                                  DAG.getTargetConstant(0, dl, MaskVT),
17679                                                  Subtarget, DAG);
17680        SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17681                                  DAG.getUNDEF(BitcastVT), FPclassMask,
17682                                  DAG.getIntPtrConstant(0, dl));
17683        return DAG.getBitcast(Op.getValueType(), Res);
17684     }
17685     case FPCLASSS: {
17686       SDValue Src1 = Op.getOperand(1);
17687       SDValue Imm = Op.getOperand(2);
17688       SDValue Mask = Op.getOperand(3);
17689       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
17690       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
17691         DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
17692       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
17693     }
17694     case CMP_MASK:
17695     case CMP_MASK_CC: {
17696       // Comparison intrinsics with masks.
17697       // Example of transformation:
17698       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17699       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17700       // (i8 (bitcast
17701       //   (v8i1 (insert_subvector undef,
17702       //           (v2i1 (and (PCMPEQM %a, %b),
17703       //                      (extract_subvector
17704       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17705       MVT VT = Op.getOperand(1).getSimpleValueType();
17706       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17707       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17708       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17709                                        Mask.getSimpleValueType().getSizeInBits());
17710       SDValue Cmp;
17711       if (IntrData->Type == CMP_MASK_CC) {
17712         SDValue CC = Op.getOperand(3);
17713         CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
17714         // We specify 2 possible opcodes for intrinsics with rounding modes.
17715         // First, we check if the intrinsic may have non-default rounding mode,
17716         // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17717         if (IntrData->Opc1 != 0) {
17718           SDValue Rnd = Op.getOperand(5);
17719           if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17720               X86::STATIC_ROUNDING::CUR_DIRECTION)
17721             Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
17722                               Op.getOperand(2), CC, Rnd);
17723         }
17724         //default rounding mode
17725         if(!Cmp.getNode())
17726             Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17727                               Op.getOperand(2), CC);
17728 
17729       } else {
17730         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17731         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17732                           Op.getOperand(2));
17733       }
17734       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17735                                              DAG.getTargetConstant(0, dl,
17736                                                                    MaskVT),
17737                                              Subtarget, DAG);
17738       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17739                                 DAG.getUNDEF(BitcastVT), CmpMask,
17740                                 DAG.getIntPtrConstant(0, dl));
17741       return DAG.getBitcast(Op.getValueType(), Res);
17742     }
17743     case CMP_MASK_SCALAR_CC: {
17744       SDValue Src1 = Op.getOperand(1);
17745       SDValue Src2 = Op.getOperand(2);
17746       SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
17747       SDValue Mask = Op.getOperand(4);
17748 
17749       SDValue Cmp;
17750       if (IntrData->Opc1 != 0) {
17751         SDValue Rnd = Op.getOperand(5);
17752         if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17753             X86::STATIC_ROUNDING::CUR_DIRECTION)
17754           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
17755       }
17756       //default rounding mode
17757       if(!Cmp.getNode())
17758         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
17759 
17760       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
17761                                              DAG.getTargetConstant(0, dl,
17762                                                                    MVT::i1),
17763                                              Subtarget, DAG);
17764 
17765       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
17766     }
17767     case COMI: { // Comparison intrinsics
17768       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17769       SDValue LHS = Op.getOperand(1);
17770       SDValue RHS = Op.getOperand(2);
17771       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17772       SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
17773       SDValue SetCC;
17774       switch (CC) {
17775       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
17776         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17777                             DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
17778         SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17779                                     DAG.getConstant(X86::COND_NP, dl, MVT::i8),
17780                                     Comi);
17781         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
17782         break;
17783       }
17784       case ISD::SETNE: { // (ZF = 1 or PF = 1)
17785         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17786                             DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
17787         SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17788                                    DAG.getConstant(X86::COND_P, dl, MVT::i8),
17789                                    Comi);
17790         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
17791         break;
17792       }
17793       case ISD::SETGT: // (CF = 0 and ZF = 0)
17794         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17795                             DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
17796         break;
17797       case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
17798         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17799                             DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
17800         break;
17801       }
17802       case ISD::SETGE: // CF = 0
17803         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17804                             DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
17805         break;
17806       case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
17807         SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17808                             DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
17809         break;
17810       default:
17811         llvm_unreachable("Unexpected illegal condition!");
17812       }
17813       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17814     }
17815     case COMI_RM: { // Comparison intrinsics with Sae
17816       SDValue LHS = Op.getOperand(1);
17817       SDValue RHS = Op.getOperand(2);
17818       unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
17819       SDValue Sae = Op.getOperand(4);
17820 
17821       SDValue FCmp;
17822       if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
17823           X86::STATIC_ROUNDING::CUR_DIRECTION)
17824         FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17825                                   DAG.getConstant(CondVal, dl, MVT::i8));
17826       else
17827         FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17828                                   DAG.getConstant(CondVal, dl, MVT::i8), Sae);
17829       // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
17830       return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
17831     }
17832     case VSHIFT:
17833       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17834                                  Op.getOperand(1), Op.getOperand(2), DAG);
17835     case COMPRESS_EXPAND_IN_REG: {
17836       SDValue Mask = Op.getOperand(3);
17837       SDValue DataToCompress = Op.getOperand(1);
17838       SDValue PassThru = Op.getOperand(2);
17839       if (isAllOnesConstant(Mask)) // return data as is
17840         return Op.getOperand(1);
17841 
17842       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17843                                               DataToCompress),
17844                                   Mask, PassThru, Subtarget, DAG);
17845     }
17846     case BROADCASTM: {
17847       SDValue Mask = Op.getOperand(1);
17848       MVT MaskVT = MVT::getVectorVT(MVT::i1,
17849                                     Mask.getSimpleValueType().getSizeInBits());
17850       Mask = DAG.getBitcast(MaskVT, Mask);
17851       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
17852     }
17853     case KUNPCK: {
17854       MVT VT = Op.getSimpleValueType();
17855       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
17856 
17857       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
17858       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
17859       // Arguments should be swapped.
17860       SDValue Res = DAG.getNode(IntrData->Opc0, dl,
17861                                 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
17862                                 Src2, Src1);
17863       return DAG.getBitcast(VT, Res);
17864     }
17865     case FIXUPIMMS:
17866     case FIXUPIMMS_MASKZ:
17867     case FIXUPIMM:
17868     case FIXUPIMM_MASKZ:{
17869       SDValue Src1 = Op.getOperand(1);
17870       SDValue Src2 = Op.getOperand(2);
17871       SDValue Src3 = Op.getOperand(3);
17872       SDValue Imm = Op.getOperand(4);
17873       SDValue Mask = Op.getOperand(5);
17874       SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
17875                                          Src1 : getZeroVector(VT, Subtarget, DAG, dl);
17876       // We support two possible forms of these intrinsics: with and without a
17877       // rounding mode.
17878       // First, check whether the intrinsic has a rounding mode (7 operands);
17879       // if not, use the "current" rounding mode.
17880       SDValue Rnd;
17881       if (Op.getNumOperands() == 7)
17882         Rnd = Op.getOperand(6);
17883       else
17884         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17885       if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
17886         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17887                                                 Src1, Src2, Src3, Imm, Rnd),
17888                                     Mask, Passthru, Subtarget, DAG);
17889       else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
17890         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17891                                        Src1, Src2, Src3, Imm, Rnd),
17892                                     Mask, Passthru, Subtarget, DAG);
17893     }
17894     case CONVERT_TO_MASK: {
17895       MVT SrcVT = Op.getOperand(1).getSimpleValueType();
17896       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
17897       MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
17898 
17899       SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
17900                                     Op.getOperand(1));
17901       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17902                                 DAG.getUNDEF(BitcastVT), CvtMask,
17903                                 DAG.getIntPtrConstant(0, dl));
17904       return DAG.getBitcast(Op.getValueType(), Res);
17905     }
17906     case CONVERT_MASK_TO_VEC: {
17907       SDValue Mask = Op.getOperand(1);
17908       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17909       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17910       return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
17911     }
17912     case BRCST_SUBVEC_TO_VEC: {
17913       SDValue Src = Op.getOperand(1);
17914       SDValue Passthru = Op.getOperand(2);
17915       SDValue Mask = Op.getOperand(3);
17916       EVT resVT = Passthru.getValueType();
17917       SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
17918                                        DAG.getUNDEF(resVT), Src,
17919                                        DAG.getIntPtrConstant(0, dl));
17920       SDValue immVal;
17921       if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
17922         immVal = DAG.getConstant(0x44, dl, MVT::i8);
17923       else
17924         immVal = DAG.getConstant(0, dl, MVT::i8);
17925       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17926                                               subVec, subVec, immVal),
17927                                   Mask, Passthru, Subtarget, DAG);
17928     }
17929     case BRCST32x2_TO_VEC: {
17930       SDValue Src = Op.getOperand(1);
17931       SDValue PassThru = Op.getOperand(2);
17932       SDValue Mask = Op.getOperand(3);
17933 
17934       assert((VT.getScalarType() == MVT::i32 ||
17935               VT.getScalarType() == MVT::f32) && "Unexpected type!");
17936       // Bitcast Src to a vector of packed 64-bit elements.
17937       MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
17938       MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
17939       Src = DAG.getBitcast(BitcastVT, Src);
17940 
17941       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17942                                   Mask, PassThru, Subtarget, DAG);
17943     }
17944     default:
17945       break;
17946     }
17947   }
17948 
17949   switch (IntNo) {
17950   default: return SDValue();    // Don't custom lower most intrinsics.
17951 
17952   case Intrinsic::x86_avx2_permd:
17953   case Intrinsic::x86_avx2_permps:
17954     // Operands intentionally swapped. Mask is last operand to intrinsic,
17955     // but second operand for node/instruction.
17956     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
17957                        Op.getOperand(2), Op.getOperand(1));
17958 
17959   // ptest and testp intrinsics. The intrinsics these come from are designed to
17960   // return an integer value, not just an instruction, so lower them to the
17961   // ptest or testp pattern plus a setcc for the result.
17962   case Intrinsic::x86_sse41_ptestz:
17963   case Intrinsic::x86_sse41_ptestc:
17964   case Intrinsic::x86_sse41_ptestnzc:
17965   case Intrinsic::x86_avx_ptestz_256:
17966   case Intrinsic::x86_avx_ptestc_256:
17967   case Intrinsic::x86_avx_ptestnzc_256:
17968   case Intrinsic::x86_avx_vtestz_ps:
17969   case Intrinsic::x86_avx_vtestc_ps:
17970   case Intrinsic::x86_avx_vtestnzc_ps:
17971   case Intrinsic::x86_avx_vtestz_pd:
17972   case Intrinsic::x86_avx_vtestc_pd:
17973   case Intrinsic::x86_avx_vtestnzc_pd:
17974   case Intrinsic::x86_avx_vtestz_ps_256:
17975   case Intrinsic::x86_avx_vtestc_ps_256:
17976   case Intrinsic::x86_avx_vtestnzc_ps_256:
17977   case Intrinsic::x86_avx_vtestz_pd_256:
17978   case Intrinsic::x86_avx_vtestc_pd_256:
17979   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17980     bool IsTestPacked = false;
17981     unsigned X86CC;
17982     switch (IntNo) {
17983     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17984     case Intrinsic::x86_avx_vtestz_ps:
17985     case Intrinsic::x86_avx_vtestz_pd:
17986     case Intrinsic::x86_avx_vtestz_ps_256:
17987     case Intrinsic::x86_avx_vtestz_pd_256:
17988       IsTestPacked = true; // Fallthrough
17989     case Intrinsic::x86_sse41_ptestz:
17990     case Intrinsic::x86_avx_ptestz_256:
17991       // ZF = 1
17992       X86CC = X86::COND_E;
17993       break;
17994     case Intrinsic::x86_avx_vtestc_ps:
17995     case Intrinsic::x86_avx_vtestc_pd:
17996     case Intrinsic::x86_avx_vtestc_ps_256:
17997     case Intrinsic::x86_avx_vtestc_pd_256:
17998       IsTestPacked = true; // Fallthrough
17999     case Intrinsic::x86_sse41_ptestc:
18000     case Intrinsic::x86_avx_ptestc_256:
18001       // CF = 1
18002       X86CC = X86::COND_B;
18003       break;
18004     case Intrinsic::x86_avx_vtestnzc_ps:
18005     case Intrinsic::x86_avx_vtestnzc_pd:
18006     case Intrinsic::x86_avx_vtestnzc_ps_256:
18007     case Intrinsic::x86_avx_vtestnzc_pd_256:
18008       IsTestPacked = true; // Fallthrough
18009     case Intrinsic::x86_sse41_ptestnzc:
18010     case Intrinsic::x86_avx_ptestnzc_256:
18011       // ZF and CF = 0
18012       X86CC = X86::COND_A;
18013       break;
18014     }
18015 
18016     SDValue LHS = Op.getOperand(1);
18017     SDValue RHS = Op.getOperand(2);
18018     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
18019     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
18020     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18021     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18022     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18023   }
18024   case Intrinsic::x86_avx512_kortestz_w:
18025   case Intrinsic::x86_avx512_kortestc_w: {
18026     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
18027     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
18028     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
18029     SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
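    // KORTESTW ORs the two mask registers and sets ZF if the result is all
    // zeros and CF if it is all ones, so kortestz tests ZF (COND_E) and
    // kortestc tests CF (COND_B).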
18030     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18031     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18032     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18033   }
18034 
18035   case Intrinsic::x86_sse42_pcmpistria128:
18036   case Intrinsic::x86_sse42_pcmpestria128:
18037   case Intrinsic::x86_sse42_pcmpistric128:
18038   case Intrinsic::x86_sse42_pcmpestric128:
18039   case Intrinsic::x86_sse42_pcmpistrio128:
18040   case Intrinsic::x86_sse42_pcmpestrio128:
18041   case Intrinsic::x86_sse42_pcmpistris128:
18042   case Intrinsic::x86_sse42_pcmpestris128:
18043   case Intrinsic::x86_sse42_pcmpistriz128:
18044   case Intrinsic::x86_sse42_pcmpestriz128: {
18045     unsigned Opcode;
18046     unsigned X86CC;
18047     switch (IntNo) {
18048     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
18049     case Intrinsic::x86_sse42_pcmpistria128:
18050       Opcode = X86ISD::PCMPISTRI;
18051       X86CC = X86::COND_A;
18052       break;
18053     case Intrinsic::x86_sse42_pcmpestria128:
18054       Opcode = X86ISD::PCMPESTRI;
18055       X86CC = X86::COND_A;
18056       break;
18057     case Intrinsic::x86_sse42_pcmpistric128:
18058       Opcode = X86ISD::PCMPISTRI;
18059       X86CC = X86::COND_B;
18060       break;
18061     case Intrinsic::x86_sse42_pcmpestric128:
18062       Opcode = X86ISD::PCMPESTRI;
18063       X86CC = X86::COND_B;
18064       break;
18065     case Intrinsic::x86_sse42_pcmpistrio128:
18066       Opcode = X86ISD::PCMPISTRI;
18067       X86CC = X86::COND_O;
18068       break;
18069     case Intrinsic::x86_sse42_pcmpestrio128:
18070       Opcode = X86ISD::PCMPESTRI;
18071       X86CC = X86::COND_O;
18072       break;
18073     case Intrinsic::x86_sse42_pcmpistris128:
18074       Opcode = X86ISD::PCMPISTRI;
18075       X86CC = X86::COND_S;
18076       break;
18077     case Intrinsic::x86_sse42_pcmpestris128:
18078       Opcode = X86ISD::PCMPESTRI;
18079       X86CC = X86::COND_S;
18080       break;
18081     case Intrinsic::x86_sse42_pcmpistriz128:
18082       Opcode = X86ISD::PCMPISTRI;
18083       X86CC = X86::COND_E;
18084       break;
18085     case Intrinsic::x86_sse42_pcmpestriz128:
18086       Opcode = X86ISD::PCMPESTRI;
18087       X86CC = X86::COND_E;
18088       break;
18089     }
18090     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18091     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18092     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
18093     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18094                                 DAG.getConstant(X86CC, dl, MVT::i8),
18095                                 SDValue(PCMP.getNode(), 1));
18096     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18097   }
18098 
18099   case Intrinsic::x86_sse42_pcmpistri128:
18100   case Intrinsic::x86_sse42_pcmpestri128: {
18101     unsigned Opcode;
18102     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
18103       Opcode = X86ISD::PCMPISTRI;
18104     else
18105       Opcode = X86ISD::PCMPESTRI;
18106 
18107     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18108     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18109     return DAG.getNode(Opcode, dl, VTs, NewOps);
18110   }
18111 
18112   case Intrinsic::eh_sjlj_lsda: {
18113     MachineFunction &MF = DAG.getMachineFunction();
18114     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18115     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18116     auto &Context = MF.getMMI().getContext();
18117     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
18118                                             Twine(MF.getFunctionNumber()));
18119     return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
18120   }
18121 
18122   case Intrinsic::x86_seh_lsda: {
18123     // Compute the symbol for the LSDA. We know it'll get emitted later.
18124     MachineFunction &MF = DAG.getMachineFunction();
18125     SDValue Op1 = Op.getOperand(1);
18126     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
18127     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
18128         GlobalValue::getRealLinkageName(Fn->getName()));
18129 
18130     // Generate a simple absolute symbol reference. This intrinsic is only
18131     // supported on 32-bit Windows, which isn't PIC.
18132     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
18133     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
18134   }
18135 
18136   case Intrinsic::x86_seh_recoverfp: {
18137     SDValue FnOp = Op.getOperand(1);
18138     SDValue IncomingFPOp = Op.getOperand(2);
18139     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
18140     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
18141     if (!Fn)
18142       report_fatal_error(
18143           "llvm.x86.seh.recoverfp must take a function as the first argument");
18144     return recoverFramePointer(DAG, Fn, IncomingFPOp);
18145   }
18146 
18147   case Intrinsic::localaddress: {
18148     // Returns one of the stack, base, or frame pointer registers, depending on
18149     // which is used to reference local variables.
18150     MachineFunction &MF = DAG.getMachineFunction();
18151     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18152     unsigned Reg;
18153     if (RegInfo->hasBasePointer(MF))
18154       Reg = RegInfo->getBaseRegister();
18155     else // This function handles the SP or FP case.
18156       Reg = RegInfo->getPtrSizedFrameRegister(MF);
18157     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
18158   }
18159   }
18160 }
18161 
18162 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18163                               SDValue Src, SDValue Mask, SDValue Base,
18164                               SDValue Index, SDValue ScaleOp, SDValue Chain,
18165                               const X86Subtarget &Subtarget) {
18166   SDLoc dl(Op);
18167   auto *C = cast<ConstantSDNode>(ScaleOp);
18168   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18169   MVT MaskVT = MVT::getVectorVT(MVT::i1,
18170                              Index.getSimpleValueType().getVectorNumElements());
18171 
18172   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18173   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
18174   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18175   SDValue Segment = DAG.getRegister(0, MVT::i32);
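  // Base, Scale, Index, Disp and Segment together form the standard X86
  // memory operand for the gather; the displacement is fixed at 0 and no
  // segment override register is used.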
18176   if (Src.isUndef())
18177     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
18178   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
18179   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18180   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
18181   return DAG.getMergeValues(RetOps, dl);
18182 }
18183 
18184 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18185                                SDValue Src, SDValue Mask, SDValue Base,
18186                                SDValue Index, SDValue ScaleOp, SDValue Chain,
18187                                const X86Subtarget &Subtarget) {
18188   SDLoc dl(Op);
18189   auto *C = cast<ConstantSDNode>(ScaleOp);
18190   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18191   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18192   SDValue Segment = DAG.getRegister(0, MVT::i32);
18193   MVT MaskVT = MVT::getVectorVT(MVT::i1,
18194                              Index.getSimpleValueType().getVectorNumElements());
18195 
18196   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18197   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
18198   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
18199   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18200   return SDValue(Res, 1);
18201 }
18202 
18203 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18204                                SDValue Mask, SDValue Base, SDValue Index,
18205                                SDValue ScaleOp, SDValue Chain,
18206                                const X86Subtarget &Subtarget) {
18207   SDLoc dl(Op);
18208   auto *C = cast<ConstantSDNode>(ScaleOp);
18209   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18210   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18211   SDValue Segment = DAG.getRegister(0, MVT::i32);
18212   MVT MaskVT =
18213     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
18214   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18215   //SDVTList VTs = DAG.getVTList(MVT::Other);
18216   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
18217   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
18218   return SDValue(Res, 0);
18219 }
18220 
18221 /// Handles the lowering of builtin intrinsics that read performance monitor
18222 /// counters (x86_rdpmc).
18223 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
18224                                       SelectionDAG &DAG,
18225                                       const X86Subtarget &Subtarget,
18226                                       SmallVectorImpl<SDValue> &Results) {
18227   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18228   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18229   SDValue LO, HI;
18230 
18231   // The ECX register is used to select the index of the performance counter
18232   // to read.
18233   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
18234                                    N->getOperand(2));
18235   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
18236 
18237   // Reads the content of a 64-bit performance counter and returns it in the
18238   // registers EDX:EAX.
18239   if (Subtarget.is64Bit()) {
18240     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18241     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18242                             LO.getValue(2));
18243   } else {
18244     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18245     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18246                             LO.getValue(2));
18247   }
18248   Chain = HI.getValue(1);
18249 
18250   if (Subtarget.is64Bit()) {
18251     // The EAX register is loaded with the low-order 32 bits. The EDX register
18252     // is loaded with the supported high-order bits of the counter.
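    // Combine the two halves into a single i64 result: (HI << 32) | LO.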
18253     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18254                               DAG.getConstant(32, DL, MVT::i8));
18255     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18256     Results.push_back(Chain);
18257     return;
18258   }
18259 
18260   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18261   SDValue Ops[] = { LO, HI };
18262   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18263   Results.push_back(Pair);
18264   Results.push_back(Chain);
18265 }
18266 
18267 /// Handles the lowering of builtin intrinsics that read the time stamp counter
18268 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
18269 /// READCYCLECOUNTER nodes.
18270 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
18271                                     SelectionDAG &DAG,
18272                                     const X86Subtarget &Subtarget,
18273                                     SmallVectorImpl<SDValue> &Results) {
18274   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18275   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
18276   SDValue LO, HI;
18277 
18278   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
18279   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
18280   // and the EAX register is loaded with the low-order 32 bits.
18281   if (Subtarget.is64Bit()) {
18282     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18283     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18284                             LO.getValue(2));
18285   } else {
18286     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18287     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18288                             LO.getValue(2));
18289   }
18290   SDValue Chain = HI.getValue(1);
18291 
18292   if (Opcode == X86ISD::RDTSCP_DAG) {
18293     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18294 
18295     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
18296     // the ECX register. Add 'ecx' explicitly to the chain.
18297     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
18298                                      HI.getValue(2));
18299     // Explicitly store the content of ECX at the location passed in input
18300     // to the 'rdtscp' intrinsic.
18301     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
18302                          MachinePointerInfo(), false, false, 0);
18303   }
18304 
18305   if (Subtarget.is64Bit()) {
18306     // The EDX register is loaded with the high-order 32 bits of the MSR, and
18307     // the EAX register is loaded with the low-order 32 bits.
18308     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18309                               DAG.getConstant(32, DL, MVT::i8));
18310     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18311     Results.push_back(Chain);
18312     return;
18313   }
18314 
18315   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18316   SDValue Ops[] = { LO, HI };
18317   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18318   Results.push_back(Pair);
18319   Results.push_back(Chain);
18320 }
18321 
18322 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
18323                                      SelectionDAG &DAG) {
18324   SmallVector<SDValue, 2> Results;
18325   SDLoc DL(Op);
18326   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
18327                           Results);
18328   return DAG.getMergeValues(Results, DL);
18329 }
18330 
18331 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
18332   MachineFunction &MF = DAG.getMachineFunction();
18333   SDValue Chain = Op.getOperand(0);
18334   SDValue RegNode = Op.getOperand(2);
18335   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18336   if (!EHInfo)
18337     report_fatal_error("EH registrations only live in functions using WinEH");
18338 
18339   // Cast the operand to an alloca, and remember the frame index.
18340   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
18341   if (!FINode)
18342     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
18343   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
18344 
18345   // Return the chain operand without making any DAG nodes.
18346   return Chain;
18347 }
18348 
18349 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
18350   MachineFunction &MF = DAG.getMachineFunction();
18351   SDValue Chain = Op.getOperand(0);
18352   SDValue EHGuard = Op.getOperand(2);
18353   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18354   if (!EHInfo)
18355     report_fatal_error("EHGuard only live in functions using WinEH");
18356 
18357   // Cast the operand to an alloca, and remember the frame index.
18358   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
18359   if (!FINode)
18360     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
18361   EHInfo->EHGuardFrameIndex = FINode->getIndex();
18362 
18363   // Return the chain operand without making any DAG nodes.
18364   return Chain;
18365 }
18366 
18367 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18368                                       SelectionDAG &DAG) {
18369   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
18370 
18371   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
18372   if (!IntrData) {
18373     if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
18374       return MarkEHRegistrationNode(Op, DAG);
18375     if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
18376       return MarkEHGuard(Op, DAG);
18377     if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
18378         IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
18379         IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
18380         IntNo == llvm::Intrinsic::x86_flags_write_u64) {
18381       // We need a frame pointer because this will get lowered to a PUSH/POP
18382       // sequence.
18383       MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18384       MFI->setHasCopyImplyingStackAdjustment(true);
18385       // Don't do anything here, we will expand these intrinsics out later
18386       // during ExpandISelPseudos in EmitInstrWithCustomInserter.
18387       return SDValue();
18388     }
18389     return SDValue();
18390   }
18391 
18392   SDLoc dl(Op);
18393   switch(IntrData->Type) {
18394   default: llvm_unreachable("Unknown Intrinsic Type");
18395   case RDSEED:
18396   case RDRAND: {
18397     // Emit the node with the right value type.
18398     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
18399     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18400 
18401     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1;
18402     // otherwise return the instruction's result, which is always 0, cast to i32.
18403     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
18404                       DAG.getConstant(1, dl, Op->getValueType(1)),
18405                       DAG.getConstant(X86::COND_B, dl, MVT::i32),
18406                       SDValue(Result.getNode(), 1) };
18407     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
18408                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
18409                                   Ops);
18410 
18411     // Return { result, isValid, chain }.
18412     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
18413                        SDValue(Result.getNode(), 2));
18414   }
18415   case GATHER: {
18416     // gather(v1, mask, index, base, scale);
18417     SDValue Chain = Op.getOperand(0);
18418     SDValue Src   = Op.getOperand(2);
18419     SDValue Base  = Op.getOperand(3);
18420     SDValue Index = Op.getOperand(4);
18421     SDValue Mask  = Op.getOperand(5);
18422     SDValue Scale = Op.getOperand(6);
18423     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
18424                          Chain, Subtarget);
18425   }
18426   case SCATTER: {
18427     // scatter(base, mask, index, v1, scale);
18428     SDValue Chain = Op.getOperand(0);
18429     SDValue Base  = Op.getOperand(2);
18430     SDValue Mask  = Op.getOperand(3);
18431     SDValue Index = Op.getOperand(4);
18432     SDValue Src   = Op.getOperand(5);
18433     SDValue Scale = Op.getOperand(6);
18434     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
18435                           Scale, Chain, Subtarget);
18436   }
18437   case PREFETCH: {
18438     SDValue Hint = Op.getOperand(6);
18439     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
18440     assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
18441     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
18442     SDValue Chain = Op.getOperand(0);
18443     SDValue Mask  = Op.getOperand(2);
18444     SDValue Index = Op.getOperand(3);
18445     SDValue Base  = Op.getOperand(4);
18446     SDValue Scale = Op.getOperand(5);
18447     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
18448                            Subtarget);
18449   }
18450   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
18451   case RDTSC: {
18452     SmallVector<SDValue, 2> Results;
18453     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
18454                             Results);
18455     return DAG.getMergeValues(Results, dl);
18456   }
18457   // Read Performance Monitoring Counters.
18458   case RDPMC: {
18459     SmallVector<SDValue, 2> Results;
18460     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
18461     return DAG.getMergeValues(Results, dl);
18462   }
18463   // XTEST intrinsics.
18464   case XTEST: {
18465     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18466     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18467     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18468                                 DAG.getConstant(X86::COND_NE, dl, MVT::i8),
18469                                 InTrans);
18470     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
18471     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
18472                        Ret, SDValue(InTrans.getNode(), 1));
18473   }
18474   // ADC/ADCX/SBB
18475   case ADX: {
18476     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18477     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
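    // Adding -1 (0xFF) to the i8 carry-in operand produces a carry-out (CF = 1)
    // exactly when the carry-in is nonzero, which rematerializes the incoming
    // carry flag for the ADC/SBB node below.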
18478     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
18479                                 DAG.getConstant(-1, dl, MVT::i8));
18480     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
18481                               Op.getOperand(4), GenCF.getValue(1));
18482     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
18483                                  Op.getOperand(5), MachinePointerInfo(),
18484                                  false, false, 0);
18485     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18486                                 DAG.getConstant(X86::COND_B, dl, MVT::i8),
18487                                 Res.getValue(1));
18488     SDValue Results[] = { SetCC, Store };
18489     return DAG.getMergeValues(Results, dl);
18490   }
18491   case COMPRESS_TO_MEM: {
18492     SDValue Mask = Op.getOperand(4);
18493     SDValue DataToCompress = Op.getOperand(3);
18494     SDValue Addr = Op.getOperand(2);
18495     SDValue Chain = Op.getOperand(0);
18496     MVT VT = DataToCompress.getSimpleValueType();
18497 
18498     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18499     assert(MemIntr && "Expected MemIntrinsicSDNode!");
18500 
18501     if (isAllOnesConstant(Mask)) // return just a store
18502       return DAG.getStore(Chain, dl, DataToCompress, Addr,
18503                           MemIntr->getMemOperand());
18504 
18505     SDValue Compressed =
18506       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
18507                            Mask, DAG.getUNDEF(VT), Subtarget, DAG);
18508     return DAG.getStore(Chain, dl, Compressed, Addr,
18509                         MemIntr->getMemOperand());
18510   }
18511   case TRUNCATE_TO_MEM_VI8:
18512   case TRUNCATE_TO_MEM_VI16:
18513   case TRUNCATE_TO_MEM_VI32: {
18514     SDValue Mask = Op.getOperand(4);
18515     SDValue DataToTruncate = Op.getOperand(3);
18516     SDValue Addr = Op.getOperand(2);
18517     SDValue Chain = Op.getOperand(0);
18518 
18519     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18520     assert(MemIntr && "Expected MemIntrinsicSDNode!");
18521 
18522     EVT VT  = MemIntr->getMemoryVT();
18523 
18524     if (isAllOnesConstant(Mask)) // return just a truncate store
18525       return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
18526                                MemIntr->getMemOperand());
18527 
18528     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18529     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18530 
18531     return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
18532                               MemIntr->getMemOperand(), true);
18533   }
18534   case EXPAND_FROM_MEM: {
18535     SDValue Mask = Op.getOperand(4);
18536     SDValue PassThru = Op.getOperand(3);
18537     SDValue Addr = Op.getOperand(2);
18538     SDValue Chain = Op.getOperand(0);
18539     MVT VT = Op.getSimpleValueType();
18540 
18541     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18542     assert(MemIntr && "Expected MemIntrinsicSDNode!");
18543 
18544     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
18545                                        MemIntr->getMemOperand());
18546 
18547     if (isAllOnesConstant(Mask)) // return just a load
18548       return DataToExpand;
18549 
18550     SDValue Results[] = {
18551       getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
18552                            Mask, PassThru, Subtarget, DAG), Chain};
18553     return DAG.getMergeValues(Results, dl);
18554   }
18555   }
18556 }
18557 
18558 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
18559                                            SelectionDAG &DAG) const {
18560   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18561   MFI->setReturnAddressIsTaken(true);
18562 
18563   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
18564     return SDValue();
18565 
18566   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18567   SDLoc dl(Op);
18568   EVT PtrVT = getPointerTy(DAG.getDataLayout());
18569 
18570   if (Depth > 0) {
18571     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
18572     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18573     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
18574     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18575                        DAG.getNode(ISD::ADD, dl, PtrVT,
18576                                    FrameAddr, Offset),
18577                        MachinePointerInfo(), false, false, false, 0);
18578   }
18579 
18580   // Just load the return address.
18581   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
18582   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18583                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
18584 }
18585 
18586 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
18587   MachineFunction &MF = DAG.getMachineFunction();
18588   MachineFrameInfo *MFI = MF.getFrameInfo();
18589   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18590   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18591   EVT VT = Op.getValueType();
18592 
18593   MFI->setFrameAddressIsTaken(true);
18594 
18595   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
18596     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
18597     // is not possible to crawl up the stack without looking at the unwind codes
18598     // simultaneously.
18599     int FrameAddrIndex = FuncInfo->getFAIndex();
18600     if (!FrameAddrIndex) {
18601       // Set up a frame object for the return address.
18602       unsigned SlotSize = RegInfo->getSlotSize();
18603       FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
18604           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
18605       FuncInfo->setFAIndex(FrameAddrIndex);
18606     }
18607     return DAG.getFrameIndex(FrameAddrIndex, VT);
18608   }
18609 
18610   unsigned FrameReg =
18611       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18612   SDLoc dl(Op);  // FIXME probably not meaningful
18613   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18614   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
18615           (FrameReg == X86::EBP && VT == MVT::i32)) &&
18616          "Invalid Frame Register!");
18617   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
18618   while (Depth--)
18619     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
18620                             MachinePointerInfo(),
18621                             false, false, false, 0);
18622   return FrameAddr;
18623 }
18624 
18625 // FIXME? Maybe this could be a TableGen attribute on some registers and
18626 // this table could be generated automatically from RegInfo.
18627 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
18628                                               SelectionDAG &DAG) const {
18629   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18630   const MachineFunction &MF = DAG.getMachineFunction();
18631 
18632   unsigned Reg = StringSwitch<unsigned>(RegName)
18633                        .Case("esp", X86::ESP)
18634                        .Case("rsp", X86::RSP)
18635                        .Case("ebp", X86::EBP)
18636                        .Case("rbp", X86::RBP)
18637                        .Default(0);
18638 
18639   if (Reg == X86::EBP || Reg == X86::RBP) {
18640     if (!TFI.hasFP(MF))
18641       report_fatal_error("register " + StringRef(RegName) +
18642                          " is allocatable: function has no frame pointer");
18643 #ifndef NDEBUG
18644     else {
18645       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18646       unsigned FrameReg =
18647           RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18648       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
18649              "Invalid Frame Register!");
18650     }
18651 #endif
18652   }
18653 
18654   if (Reg)
18655     return Reg;
18656 
18657   report_fatal_error("Invalid register name global variable");
18658 }
18659 
18660 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18661                                                      SelectionDAG &DAG) const {
18662   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18663   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
18664 }
18665 
18666 unsigned X86TargetLowering::getExceptionPointerRegister(
18667     const Constant *PersonalityFn) const {
18668   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
18669     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18670 
18671   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
18672 }
18673 
18674 unsigned X86TargetLowering::getExceptionSelectorRegister(
18675     const Constant *PersonalityFn) const {
18676   // Funclet personalities don't use selectors (the runtime does the selection).
18677   assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
18678   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18679 }
18680 
18681 bool X86TargetLowering::needsFixedCatchObjects() const {
18682   return Subtarget.isTargetWin64();
18683 }
18684 
18685 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18686   SDValue Chain     = Op.getOperand(0);
18687   SDValue Offset    = Op.getOperand(1);
18688   SDValue Handler   = Op.getOperand(2);
18689   SDLoc dl      (Op);
18690 
18691   EVT PtrVT = getPointerTy(DAG.getDataLayout());
18692   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18693   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18694   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18695           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18696          "Invalid Frame Register!");
18697   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18698   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18699 
18700   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18701                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
18702                                                        dl));
18703   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18704   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18705                        false, false, 0);
18706   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18707 
18708   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18709                      DAG.getRegister(StoreAddrReg, PtrVT));
18710 }
18711 
18712 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18713                                                SelectionDAG &DAG) const {
18714   SDLoc DL(Op);
18715   // If the subtarget is not 64-bit, we may need the global base register
18716   // after the pseudo is expanded by isel, i.e., after the CGBR pass has run.
18717   // Therefore, ask for the GlobalBaseReg now, so that the pass
18718   // inserts the code for us in case we need it.
18719   // Otherwise, we would end up referencing a virtual register that is
18720   // never defined!
18721   if (!Subtarget.is64Bit()) {
18722     const X86InstrInfo *TII = Subtarget.getInstrInfo();
18723     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
18724   }
18725   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18726                      DAG.getVTList(MVT::i32, MVT::Other),
18727                      Op.getOperand(0), Op.getOperand(1));
18728 }
18729 
18730 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18731                                                 SelectionDAG &DAG) const {
18732   SDLoc DL(Op);
18733   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18734                      Op.getOperand(0), Op.getOperand(1));
18735 }
18736 
18737 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
18738                                                        SelectionDAG &DAG) const {
18739   SDLoc DL(Op);
18740   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
18741                      Op.getOperand(0));
18742 }
18743 
18744 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18745   return Op.getOperand(0);
18746 }
18747 
18748 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18749                                                 SelectionDAG &DAG) const {
18750   SDValue Root = Op.getOperand(0);
18751   SDValue Trmp = Op.getOperand(1); // trampoline
18752   SDValue FPtr = Op.getOperand(2); // nested function
18753   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18754   SDLoc dl (Op);
18755 
18756   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18757   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
18758 
18759   if (Subtarget.is64Bit()) {
18760     SDValue OutChains[6];
18761 
18762     // Large code-model.
18763     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18764     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18765 
18766     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18767     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18768 
18769     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
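    // 0x40 is the fixed REX marker, 0x08 is REX.W (64-bit operand size) and
    // 0x01 is REX.B, which is required to encode the extended registers
    // R10/R11.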
18770 
18771     // Load the pointer to the nested function into R11.
18772     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18773     SDValue Addr = Trmp;
18774     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18775                                 Addr, MachinePointerInfo(TrmpAddr),
18776                                 false, false, 0);
18777 
18778     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18779                        DAG.getConstant(2, dl, MVT::i64));
18780     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18781                                 MachinePointerInfo(TrmpAddr, 2),
18782                                 false, false, 2);
18783 
18784     // Load the 'nest' parameter value into R10.
18785     // R10 is specified in X86CallingConv.td
18786     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18787     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18788                        DAG.getConstant(10, dl, MVT::i64));
18789     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18790                                 Addr, MachinePointerInfo(TrmpAddr, 10),
18791                                 false, false, 0);
18792 
18793     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18794                        DAG.getConstant(12, dl, MVT::i64));
18795     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18796                                 MachinePointerInfo(TrmpAddr, 12),
18797                                 false, false, 2);
18798 
18799     // Jump to the nested function.
18800     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18801     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18802                        DAG.getConstant(20, dl, MVT::i64));
18803     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18804                                 Addr, MachinePointerInfo(TrmpAddr, 20),
18805                                 false, false, 0);
18806 
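    // ModRM byte for "jmpq *%r11": mod = 0b11 (register-direct), reg = 4 (the
    // /4 opcode extension selecting JMP r/m64), r/m = low three bits of R11.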
18807     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18808     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18809                        DAG.getConstant(22, dl, MVT::i64));
18810     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
18811                                 Addr, MachinePointerInfo(TrmpAddr, 22),
18812                                 false, false, 0);
18813 
18814     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18815   } else {
18816     const Function *Func =
18817       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18818     CallingConv::ID CC = Func->getCallingConv();
18819     unsigned NestReg;
18820 
18821     switch (CC) {
18822     default:
18823       llvm_unreachable("Unsupported calling convention");
18824     case CallingConv::C:
18825     case CallingConv::X86_StdCall: {
18826       // Pass 'nest' parameter in ECX.
18827       // Must be kept in sync with X86CallingConv.td
18828       NestReg = X86::ECX;
18829 
18830       // Check that ECX wasn't needed by an 'inreg' parameter.
18831       FunctionType *FTy = Func->getFunctionType();
18832       const AttributeSet &Attrs = Func->getAttributes();
18833 
18834       if (!Attrs.isEmpty() && !Func->isVarArg()) {
18835         unsigned InRegCount = 0;
18836         unsigned Idx = 1;
18837 
18838         for (FunctionType::param_iterator I = FTy->param_begin(),
18839              E = FTy->param_end(); I != E; ++I, ++Idx)
18840           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
18841             auto &DL = DAG.getDataLayout();
18842             // FIXME: should only count parameters that are lowered to integers.
18843             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
18844           }
18845 
18846         if (InRegCount > 2) {
18847           report_fatal_error("Nest register in use - reduce number of inreg"
18848                              " parameters!");
18849         }
18850       }
18851       break;
18852     }
18853     case CallingConv::X86_FastCall:
18854     case CallingConv::X86_ThisCall:
18855     case CallingConv::Fast:
18856       // Pass 'nest' parameter in EAX.
18857       // Must be kept in sync with X86CallingConv.td
18858       NestReg = X86::EAX;
18859       break;
18860     }
18861 
18862     SDValue OutChains[4];
18863     SDValue Addr, Disp;
18864 
18865     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18866                        DAG.getConstant(10, dl, MVT::i32));
18867     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18868 
18869     // This is storing the opcode for MOV32ri.
18870     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18871     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18872     OutChains[0] = DAG.getStore(Root, dl,
18873                                 DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
18874                                 Trmp, MachinePointerInfo(TrmpAddr),
18875                                 false, false, 0);
18876 
18877     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18878                        DAG.getConstant(1, dl, MVT::i32));
18879     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18880                                 MachinePointerInfo(TrmpAddr, 1),
18881                                 false, false, 1);
18882 
18883     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18884     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18885                        DAG.getConstant(5, dl, MVT::i32));
18886     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
18887                                 Addr, MachinePointerInfo(TrmpAddr, 5),
18888                                 false, false, 1);
18889 
18890     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18891                        DAG.getConstant(6, dl, MVT::i32));
18892     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18893                                 MachinePointerInfo(TrmpAddr, 6),
18894                                 false, false, 1);
18895 
18896     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18897   }
18898 }
18899 
18900 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18901                                             SelectionDAG &DAG) const {
18902   /*
18903    The rounding mode is in bits 11:10 of FPSR, and has the following
18904    settings:
18905      00 Round to nearest
18906      01 Round to -inf
18907      10 Round to +inf
18908      11 Round to 0
18909 
18910   FLT_ROUNDS, on the other hand, expects the following:
18911     -1 Undefined
18912      0 Round to 0
18913      1 Round to nearest
18914      2 Round to +inf
18915      3 Round to -inf
18916 
18917   To perform the conversion, we do:
18918     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
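
  For example, if the rounding control bits are 01 (round to -inf):
    bit 11 = 0 and bit 10 = 1, so ((0x000 >> 11) | (0x400 >> 9)) = 2,
    and (2 + 1) & 3 = 3, which is FLT_ROUNDS' value for "round to -inf".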
18919   */
18920 
18921   MachineFunction &MF = DAG.getMachineFunction();
18922   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18923   unsigned StackAlignment = TFI.getStackAlignment();
18924   MVT VT = Op.getSimpleValueType();
18925   SDLoc DL(Op);
18926 
18927   // Save FP Control Word to stack slot
18928   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18929   SDValue StackSlot =
18930       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
18931 
18932   MachineMemOperand *MMO =
18933       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
18934                               MachineMemOperand::MOStore, 2, 2);
18935 
18936   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18937   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18938                                           DAG.getVTList(MVT::Other),
18939                                           Ops, MVT::i16, MMO);
18940 
18941   // Load FP Control Word from stack slot
18942   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18943                             MachinePointerInfo(), false, false, false, 0);
18944 
18945   // Transform as necessary
18946   SDValue CWD1 =
18947     DAG.getNode(ISD::SRL, DL, MVT::i16,
18948                 DAG.getNode(ISD::AND, DL, MVT::i16,
18949                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
18950                 DAG.getConstant(11, DL, MVT::i8));
18951   SDValue CWD2 =
18952     DAG.getNode(ISD::SRL, DL, MVT::i16,
18953                 DAG.getNode(ISD::AND, DL, MVT::i16,
18954                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
18955                 DAG.getConstant(9, DL, MVT::i8));
18956 
18957   SDValue RetVal =
18958     DAG.getNode(ISD::AND, DL, MVT::i16,
18959                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18960                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18961                             DAG.getConstant(1, DL, MVT::i16)),
18962                 DAG.getConstant(3, DL, MVT::i16));
18963 
18964   return DAG.getNode((VT.getSizeInBits() < 16 ?
18965                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18966 }
18967 
18968 /// \brief Lower a vector CTLZ using the natively supported CTLZ instruction.
18969 //
18970 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are extended
18971 //    to a 512-bit vector.
18972 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
18973 //    ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
18974 //    split the vector, perform the operation on its Lo and Hi parts and
18975 //    concatenate the results.
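// For example (i8 case, illustrative only): for x = 0x10, lzcnt of
// zext32(x) = 0x00000010 is 27; truncating and subtracting the delta
// 32 - 8 = 24 gives ctlz8(0x10) = 3.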
18976 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
18977   assert(Op.getOpcode() == ISD::CTLZ);
18978   SDLoc dl(Op);
18979   MVT VT = Op.getSimpleValueType();
18980   MVT EltVT = VT.getVectorElementType();
18981   unsigned NumElems = VT.getVectorNumElements();
18982 
18983   if (EltVT == MVT::i64 || EltVT == MVT::i32) {
18984     // Extend to 512 bit vector.
18985     assert((VT.is256BitVector() || VT.is128BitVector()) &&
18986               "Unsupported value type for operation");
18987 
18988     MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
18989     SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
18990                                  DAG.getUNDEF(NewVT),
18991                                  Op.getOperand(0),
18992                                  DAG.getIntPtrConstant(0, dl));
18993     SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
18994 
18995     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
18996                        DAG.getIntPtrConstant(0, dl));
18997   }
18998 
18999   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
19000           "Unsupported element type");
19001 
19002   if (16 < NumElems) {
19003     // Split the vector; its Lo and Hi parts will be handled in the next iteration.
19004     SDValue Lo, Hi;
19005     std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
19006     MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
19007 
19008     Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
19009     Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
19010 
19011     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
19012   }
19013 
19014   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
19015 
19016   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
19017           "Unsupported value type for operation");
19018 
19019   // Use the natively supported vector instruction vplzcntd.
19020   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
19021   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
19022   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
19023   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
19024 
19025   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
19026 }
19027 
19028 // Lower CTLZ using a PSHUFB lookup table implementation.
19029 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
19030                                        const X86Subtarget &Subtarget,
19031                                        SelectionDAG &DAG) {
19032   MVT VT = Op.getSimpleValueType();
19033   int NumElts = VT.getVectorNumElements();
19034   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
19035   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
19036 
19037   // Per-nibble leading zero PSHUFB lookup table.
19038   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
19039                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
19040                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
19041                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
19042 
19043   SmallVector<SDValue, 64> LUTVec;
19044   for (int i = 0; i < NumBytes; ++i)
19045     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
19046   SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
19047 
19048   // Begin by bitcasting the input to byte vector, then split those bytes
19049   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
19050   // If the hi input nibble is zero then we add both results together, otherwise
19051   // we just take the hi result (by masking the lo result to zero before the
19052   // add).
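  // For example (per byte, illustrative only): for input byte 0x1A the hi
  // nibble is 0x1, so LUT[1] = 3 is used as-is (the lo result is masked to
  // zero); for 0x05 the hi nibble is zero, so LUT[0] + LUT[5] = 4 + 1 = 5.
  // Both match ctlz of the full byte.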
19053   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
19054   SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
19055 
19056   SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
19057   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
19058   SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
19059   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
19060   SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
19061 
19062   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
19063   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
19064   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
19065   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
19066 
19067   // Merge the result back from vXi8 to VT, working on the lo/hi halves
19068   // of the current vector width in the same way we did for the nibbles.
19069   // If the upper half of the input element is zero then add the halves'
19070   // leading zero counts together, otherwise just use the upper half's.
19071   // Double the width of the result until we are at target width.
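  // For example (first step, vXi8 -> vXi16, illustrative only): for the i16
  // element 0x00F0 the upper byte is zero, so the result is the upper half's
  // count (8) plus the lower half's count (0), i.e. ctlz16(0x00F0) = 8; for
  // 0x40F0 the upper byte is non-zero and its count (1) is used on its own.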
19072   while (CurrVT != VT) {
19073     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
19074     int CurrNumElts = CurrVT.getVectorNumElements();
19075     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
19076     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
19077     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
19078 
19079     // Check if the upper half of the input element is zero.
19080     SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
19081                                DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
19082     HiZ = DAG.getBitcast(NextVT, HiZ);
19083 
19084     // Move the upper/lower halves to the lower bits as we'll be extending to
19085     // NextVT. Mask the lower result to zero if HiZ is true and add the results
19086     // together.
19087     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
19088     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
19089     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
19090     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
19091     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
19092     CurrVT = NextVT;
19093   }
19094 
19095   return Res;
19096 }
19097 
19098 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
19099                                const X86Subtarget &Subtarget,
19100                                SelectionDAG &DAG) {
19101   MVT VT = Op.getSimpleValueType();
19102   SDValue Op0 = Op.getOperand(0);
19103 
19104   if (Subtarget.hasAVX512())
19105     return LowerVectorCTLZ_AVX512(Op, DAG);
19106 
19107   // Decompose 256-bit ops into smaller 128-bit ops.
19108   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19109     unsigned NumElems = VT.getVectorNumElements();
19110 
19111     // Extract each 128-bit vector, perform ctlz and concat the result.
19112     SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
19113     SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
19114 
19115     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
19116                        DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
19117                        DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
19118   }
19119 
19120   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
19121   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
19122 }
19123 
19124 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
19125                          SelectionDAG &DAG) {
19126   MVT VT = Op.getSimpleValueType();
19127   MVT OpVT = VT;
19128   unsigned NumBits = VT.getSizeInBits();
19129   SDLoc dl(Op);
19130   unsigned Opc = Op.getOpcode();
19131 
19132   if (VT.isVector())
19133     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
19134 
19135   Op = Op.getOperand(0);
19136   if (VT == MVT::i8) {
19137     // Zero extend to i32 since there is no i8 bsr.
19138     OpVT = MVT::i32;
19139     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
19140   }
19141 
19142   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
19143   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
19144   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
19145 
19146   if (Opc == ISD::CTLZ) {
19147     // If src is zero (i.e. bsr sets ZF), returns NumBits.
19148     SDValue Ops[] = {
19149       Op,
19150       DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
19151       DAG.getConstant(X86::COND_E, dl, MVT::i8),
19152       Op.getValue(1)
19153     };
19154     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
19155   }
19156 
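  // BSR returns the index of the highest set bit, so ctlz(x) is
  // (NumBits - 1) - BSR(x); as BSR(x) <= NumBits - 1 the subtraction can be
  // done with the xor below. For example (i32, illustrative only):
  // BSR(0x00F00000) = 23 and 23 ^ 31 = 8 = ctlz. For a zero input the CMOV
  // above selects 2*NumBits - 1, so the xor then produces NumBits.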
19157   // Finally xor with NumBits-1.
19158   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
19159                    DAG.getConstant(NumBits - 1, dl, OpVT));
19160 
19161   if (VT == MVT::i8)
19162     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
19163   return Op;
19164 }
19165 
19166 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
19167   MVT VT = Op.getSimpleValueType();
19168   unsigned NumBits = VT.getScalarSizeInBits();
19169   SDLoc dl(Op);
19170 
19171   if (VT.isVector()) {
19172     SDValue N0 = Op.getOperand(0);
19173     SDValue Zero = DAG.getConstant(0, dl, VT);
19174 
19175     // lsb(x) = (x & -x)
19176     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
19177                               DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
19178 
19179     // cttz_undef(x) = (width - 1) - ctlz(lsb)
19180     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
19181       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
19182       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
19183                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
19184     }
19185 
19186     // cttz(x) = ctpop(lsb - 1)
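    // For example (illustrative only): for x = 12 (0b1100), lsb = 4 and
    // ctpop(4 - 1) = ctpop(0b011) = 2 = cttz(12); the undef variant above
    // computes the same value as (width - 1) - ctlz(lsb).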
19187     SDValue One = DAG.getConstant(1, dl, VT);
19188     return DAG.getNode(ISD::CTPOP, dl, VT,
19189                        DAG.getNode(ISD::SUB, dl, VT, LSB, One));
19190   }
19191 
19192   assert(Op.getOpcode() == ISD::CTTZ &&
19193          "Only scalar CTTZ requires custom lowering");
19194 
19195   // Issue a bsf (scan bits forward) which also sets EFLAGS.
19196   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19197   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
19198 
19199   // If src is zero (i.e. bsf sets ZF), returns NumBits.
19200   SDValue Ops[] = {
19201     Op,
19202     DAG.getConstant(NumBits, dl, VT),
19203     DAG.getConstant(X86::COND_E, dl, MVT::i8),
19204     Op.getValue(1)
19205   };
19206   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
19207 }
19208 
19209 /// Break a 256-bit integer operation into two new 128-bit ones and then
19210 /// concatenate the result back.
19211 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
19212   MVT VT = Op.getSimpleValueType();
19213 
19214   assert(VT.is256BitVector() && VT.isInteger() &&
19215          "Unsupported value type for operation");
19216 
19217   unsigned NumElems = VT.getVectorNumElements();
19218   SDLoc dl(Op);
19219 
19220   // Extract the LHS vectors
19221   SDValue LHS = Op.getOperand(0);
19222   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
19223   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
19224 
19225   // Extract the RHS vectors
19226   SDValue RHS = Op.getOperand(1);
19227   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
19228   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
19229 
19230   MVT EltVT = VT.getVectorElementType();
19231   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19232 
19233   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19234                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19235                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19236 }
19237 
19238 /// Break a 512-bit integer operation into two new 256-bit ones and then
19239 /// concatenate the result back.
19240 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
19241   MVT VT = Op.getSimpleValueType();
19242 
19243   assert(VT.is512BitVector() && VT.isInteger() &&
19244          "Unsupported value type for operation");
19245 
19246   unsigned NumElems = VT.getVectorNumElements();
19247   SDLoc dl(Op);
19248 
19249   // Extract the LHS vectors
19250   SDValue LHS = Op.getOperand(0);
19251   SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
19252   SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
19253 
19254   // Extract the RHS vectors
19255   SDValue RHS = Op.getOperand(1);
19256   SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
19257   SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
19258 
19259   MVT EltVT = VT.getVectorElementType();
19260   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19261 
19262   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19263                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19264                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19265 }
19266 
19267 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
19268   if (Op.getValueType() == MVT::i1)
19269     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19270                        Op.getOperand(0), Op.getOperand(1));
19271   assert(Op.getSimpleValueType().is256BitVector() &&
19272          Op.getSimpleValueType().isInteger() &&
19273          "Only handle AVX 256-bit vector integer operation");
19274   return Lower256IntArith(Op, DAG);
19275 }
19276 
19277 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
19278   if (Op.getValueType() == MVT::i1)
19279     return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19280                        Op.getOperand(0), Op.getOperand(1));
19281   assert(Op.getSimpleValueType().is256BitVector() &&
19282          Op.getSimpleValueType().isInteger() &&
19283          "Only handle AVX 256-bit vector integer operation");
19284   return Lower256IntArith(Op, DAG);
19285 }
19286 
19287 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
19288   assert(Op.getSimpleValueType().is256BitVector() &&
19289          Op.getSimpleValueType().isInteger() &&
19290          "Only handle AVX 256-bit vector integer operation");
19291   return Lower256IntArith(Op, DAG);
19292 }
19293 
19294 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
19295                         SelectionDAG &DAG) {
19296   SDLoc dl(Op);
19297   MVT VT = Op.getSimpleValueType();
19298 
19299   if (VT == MVT::i1)
19300     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
19301 
19302   // Decompose 256-bit ops into smaller 128-bit ops.
19303   if (VT.is256BitVector() && !Subtarget.hasInt256())
19304     return Lower256IntArith(Op, DAG);
19305 
19306   SDValue A = Op.getOperand(0);
19307   SDValue B = Op.getOperand(1);
19308 
19309   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
19310   // vector pairs, multiply and truncate.
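  // This is safe because the low 8 bits of the 16-bit product are the i8
  // product regardless of how the inputs were extended. For example
  // (illustrative only): (-3) * 5 = -15 = 0xFFF1 as i16, and its low byte
  // 0xF1 = 241 = (-3 * 5) mod 256.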
19311   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
19312     if (Subtarget.hasInt256()) {
19313       // For 512-bit vectors, split into 256-bit vectors to allow the
19314       // sign-extension to occur.
19315       if (VT == MVT::v64i8)
19316         return Lower512IntArith(Op, DAG);
19317 
19318       // For 256-bit vectors, split into 128-bit vectors to allow the
19319       // sign-extension to occur. We don't need this on AVX512BW as we can
19320       // safely sign-extend to v32i16.
19321       if (VT == MVT::v32i8 && !Subtarget.hasBWI())
19322         return Lower256IntArith(Op, DAG);
19323 
19324       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
19325       return DAG.getNode(
19326           ISD::TRUNCATE, dl, VT,
19327           DAG.getNode(ISD::MUL, dl, ExVT,
19328                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
19329                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
19330     }
19331 
19332     assert(VT == MVT::v16i8 &&
19333            "Pre-AVX2 support only supports v16i8 multiplication");
19334     MVT ExVT = MVT::v8i16;
19335 
19336     // Extract the lo parts and sign extend to i16
19337     SDValue ALo, BLo;
19338     if (Subtarget.hasSSE41()) {
19339       ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
19340       BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
19341     } else {
19342       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19343                               -1, 4, -1, 5, -1, 6, -1, 7};
19344       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19345       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19346       ALo = DAG.getBitcast(ExVT, ALo);
19347       BLo = DAG.getBitcast(ExVT, BLo);
19348       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19349       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19350     }
19351 
19352     // Extract the hi parts and sign extend to i16
19353     SDValue AHi, BHi;
19354     if (Subtarget.hasSSE41()) {
19355       const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19356                               -1, -1, -1, -1, -1, -1, -1, -1};
19357       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19358       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19359       AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
19360       BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
19361     } else {
19362       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
19363                               -1, 12, -1, 13, -1, 14, -1, 15};
19364       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19365       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19366       AHi = DAG.getBitcast(ExVT, AHi);
19367       BHi = DAG.getBitcast(ExVT, BHi);
19368       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19369       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19370     }
19371 
19372     // Multiply, mask the lower 8 bits of the lo/hi results and pack
19373     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19374     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19375     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
19376     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
19377     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19378   }
19379 
19380   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
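  // PMULUDQ multiplies only the even (0, 2) lanes into 64-bit products, so
  // the odd lanes are first moved to even positions and multiplied
  // separately; the low 32 bits of each 64-bit product (which equal the i32
  // multiply result) are then interleaved back into the original lane order.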
19381   if (VT == MVT::v4i32) {
19382     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
19383            "Should not custom lower when pmuldq is available!");
19384 
19385     // Extract the odd parts.
19386     static const int UnpackMask[] = { 1, -1, 3, -1 };
19387     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
19388     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
19389 
19390     // Multiply the even parts.
19391     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
19392     // Now multiply odd parts.
19393     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
19394 
19395     Evens = DAG.getBitcast(VT, Evens);
19396     Odds = DAG.getBitcast(VT, Odds);
19397 
19398     // Merge the two vectors back together with a shuffle. This expands into 2
19399     // shuffles.
19400     static const int ShufMask[] = { 0, 4, 2, 6 };
19401     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
19402   }
19403 
19404   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
19405          "Only know how to lower V2I64/V4I64/V8I64 multiply");
19406 
19407   //  Ahi = psrlqi(a, 32);
19408   //  Bhi = psrlqi(b, 32);
19409   //
19410   //  AloBlo = pmuludq(a, b);
19411   //  AloBhi = pmuludq(a, Bhi);
19412   //  AhiBlo = pmuludq(Ahi, b);
19413 
19414   //  AloBhi = psllqi(AloBhi, 32);
19415   //  AhiBlo = psllqi(AhiBlo, 32);
19416   //  return AloBlo + AloBhi + AhiBlo;
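  //  This follows from writing a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo:
  //  a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi, where the last
  //  term vanishes modulo 2^64.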
19417 
19418   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
19419   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
19420 
19421   SDValue AhiBlo = Ahi;
19422   SDValue AloBhi = Bhi;
19423   // Bit cast to 32-bit vectors for MULUDQ
19424   MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
19425                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
19426   A = DAG.getBitcast(MulVT, A);
19427   B = DAG.getBitcast(MulVT, B);
19428   Ahi = DAG.getBitcast(MulVT, Ahi);
19429   Bhi = DAG.getBitcast(MulVT, Bhi);
19430 
19431   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
19432   // After shifting constant values right by 32, the result may be all zeros.
19433   if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
19434     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
19435     AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
19436   }
19437   if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
19438     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
19439     AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
19440   }
19441 
19442   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
19443   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
19444 }
19445 
19446 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
19447                          SelectionDAG &DAG) {
19448   SDLoc dl(Op);
19449   MVT VT = Op.getSimpleValueType();
19450 
19451   // Decompose 256-bit ops into smaller 128-bit ops.
19452   if (VT.is256BitVector() && !Subtarget.hasInt256())
19453     return Lower256IntArith(Op, DAG);
19454 
19455   // Only i8 vectors should need custom lowering after this.
19456   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
19457          "Unsupported vector type");
19458 
19459   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
19460   // logical shift down the upper half and pack back to i8.
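  // For example (MULHU on i8, illustrative only): 200 * 200 = 40000 = 0x9C40,
  // so the high byte of the widened product, 0x9C = 156, is the result.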
19461   SDValue A = Op.getOperand(0);
19462   SDValue B = Op.getOperand(1);
19463 
19464   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
19465   // and then ashr/lshr the upper bits down to the lower bits before multiply.
19466   unsigned Opcode = Op.getOpcode();
19467   unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
19468   unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
19469 
19470   // AVX2 implementations - extend xmm subvectors to ymm.
19471   if (Subtarget.hasInt256()) {
19472     SDValue Lo = DAG.getIntPtrConstant(0, dl);
19473     SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
19474 
19475     if (VT == MVT::v32i8) {
19476       SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
19477       SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
19478       SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
19479       SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
19480       ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
19481       BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
19482       AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
19483       BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
19484       Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19485                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
19486                        DAG.getConstant(8, dl, MVT::v16i16));
19487       Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19488                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
19489                        DAG.getConstant(8, dl, MVT::v16i16));
19490       // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
19491       // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
19492       const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
19493                             16, 17, 18, 19, 20, 21, 22, 23};
19494       const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19495                             24, 25, 26, 27, 28, 29, 30, 31};
19496       return DAG.getNode(X86ISD::PACKUS, dl, VT,
19497                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
19498                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
19499     }
19500 
19501     SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
19502     SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
19503     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
19504     SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
19505                                DAG.getConstant(8, dl, MVT::v16i16));
19506     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
19507     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
19508     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
19509   }
19510 
19511   assert(VT == MVT::v16i8 &&
19512          "Pre-AVX2 support only supports v16i8 multiplication");
19513   MVT ExVT = MVT::v8i16;
19514 
19515   // Extract the lo parts and zero/sign extend to i16.
19516   SDValue ALo, BLo;
19517   if (Subtarget.hasSSE41()) {
19518     ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
19519     BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
19520   } else {
19521     const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19522                             -1, 4, -1, 5, -1, 6, -1, 7};
19523     ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19524     BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19525     ALo = DAG.getBitcast(ExVT, ALo);
19526     BLo = DAG.getBitcast(ExVT, BLo);
19527     ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19528     BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19529   }
19530 
19531   // Extract the hi parts and zero/sign extend to i16.
19532   SDValue AHi, BHi;
19533   if (Subtarget.hasSSE41()) {
19534     const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19535                             -1, -1, -1, -1, -1, -1, -1, -1};
19536     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19537     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19538     AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
19539     BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
19540   } else {
19541     const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
19542                             -1, 12, -1, 13, -1, 14, -1, 15};
19543     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19544     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19545     AHi = DAG.getBitcast(ExVT, AHi);
19546     BHi = DAG.getBitcast(ExVT, BHi);
19547     AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19548     BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19549   }
19550 
19551   // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
19552   // pack back to v16i8.
19553   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19554   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19555   RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
19556   RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
19557   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19558 }
19559 
19560 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
19561   assert(Subtarget.isTargetWin64() && "Unexpected target");
19562   EVT VT = Op.getValueType();
19563   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
19564          "Unexpected return type for lowering");
19565 
19566   RTLIB::Libcall LC;
19567   bool isSigned;
19568   switch (Op->getOpcode()) {
19569   default: llvm_unreachable("Unexpected request for libcall!");
19570   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
19571   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
19572   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
19573   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
19574   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
19575   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
19576   }
19577 
19578   SDLoc dl(Op);
19579   SDValue InChain = DAG.getEntryNode();
19580 
19581   TargetLowering::ArgListTy Args;
19582   TargetLowering::ArgListEntry Entry;
19583   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
19584     EVT ArgVT = Op->getOperand(i).getValueType();
19585     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
19586            "Unexpected argument type for lowering");
19587     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
19588     Entry.Node = StackPtr;
19589     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
19590                            false, false, 16);
19591     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19592     Entry.Ty = PointerType::get(ArgTy,0);
19593     Entry.isSExt = false;
19594     Entry.isZExt = false;
19595     Args.push_back(Entry);
19596   }
19597 
19598   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
19599                                          getPointerTy(DAG.getDataLayout()));
19600 
19601   TargetLowering::CallLoweringInfo CLI(DAG);
19602   CLI.setDebugLoc(dl).setChain(InChain)
19603     .setCallee(getLibcallCallingConv(LC),
19604                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
19605                Callee, std::move(Args))
19606     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
19607 
19608   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
19609   return DAG.getBitcast(VT, CallInfo.first);
19610 }
19611 
19612 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
19613                              SelectionDAG &DAG) {
19614   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
19615   MVT VT = Op0.getSimpleValueType();
19616   SDLoc dl(Op);
19617 
19618   // Decompose 256-bit ops into smaller 128-bit ops.
19619   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19620     unsigned Opcode = Op.getOpcode();
19621     unsigned NumElems = VT.getVectorNumElements();
19622     MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
19623     SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
19624     SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
19625     SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
19626     SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
19627     SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
19628     SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
19629     SDValue Ops[] = {
19630       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
19631       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
19632     };
19633     return DAG.getMergeValues(Ops, dl);
19634   }
19635 
19636   assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
19637          (VT == MVT::v8i32 && Subtarget.hasInt256()));
19638 
19639   // PMULxD operations multiply each even value (starting at 0) of LHS with
19640   // the related value of RHS and produce a widened result.
19641   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19642   // => <2 x i64> <ae|cg>
19643   //
19644   // In other words, to have all the results, we need to perform two PMULxD:
19645   // 1. one with the even values.
19646   // 2. one with the odd values.
19647   // To achieve #2, we need to place the odd values at an even position.
19648   //
19649   // Place the odd value at an even position (basically, shift all values 1
19650   // step to the left):
19651   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
19652   // <a|b|c|d> => <b|undef|d|undef>
19653   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
19654                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19655   // <e|f|g|h> => <f|undef|h|undef>
19656   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
19657                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19658 
19659   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
19660   // ints.
19661   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
19662   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
19663   unsigned Opcode =
19664       (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
19665   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19666   // => <2 x i64> <ae|cg>
19667   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
19668   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
19669   // => <2 x i64> <bf|dh>
19670   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
19671 
19672   // Shuffle it back into the right order.
19673   SDValue Highs, Lows;
19674   if (VT == MVT::v8i32) {
19675     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
19676     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19677     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
19678     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19679   } else {
19680     const int HighMask[] = {1, 5, 3, 7};
19681     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19682     const int LowMask[] = {0, 4, 2, 6};
19683     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19684   }
19685 
19686   // If we have a signed multiply but no PMULDQ, fix up the high parts of an
19687   // unsigned multiply.
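  // This relies on the identity (per lane, modulo 2^32):
  //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
  // where the correction terms are built below from arithmetic shifts, since
  // sra(x, 31) is all-ones exactly when x is negative.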
19688   if (IsSigned && !Subtarget.hasSSE41()) {
19689     SDValue ShAmt = DAG.getConstant(
19690         31, dl,
19691         DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
19692     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
19693                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
19694     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
19695                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
19696 
19697     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
19698     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
19699   }
19700 
19701   // The first result of MUL_LOHI is actually the low value, followed by the
19702   // high value.
19703   SDValue Ops[] = {Lows, Highs};
19704   return DAG.getMergeValues(Ops, dl);
19705 }
19706 
19707 // Return true if the required (according to Opcode) shift-imm form is natively
19708 // supported by the Subtarget
19709 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
19710                                         unsigned Opcode) {
19711   if (VT.getScalarSizeInBits() < 16)
19712     return false;
19713 
19714   if (VT.is512BitVector() &&
19715       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
19716     return true;
19717 
19718   bool LShift = VT.is128BitVector() ||
19719     (VT.is256BitVector() && Subtarget.hasInt256());
19720 
19721   bool AShift = LShift && (Subtarget.hasVLX() ||
19722     (VT != MVT::v2i64 && VT != MVT::v4i64));
19723   return (Opcode == ISD::SRA) ? AShift : LShift;
19724 }
19725 
19726 // The shift amount is a variable, but it is the same for all vector lanes.
19727 // These instructions are defined together with shift-immediate.
19728 static
19729 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
19730                                       unsigned Opcode) {
19731   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
19732 }
19733 
19734 // Return true if the required (according to Opcode) variable-shift form is
19735 // natively supported by the Subtarget
19736 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
19737                                     unsigned Opcode) {
19738 
19739   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
19740     return false;
19741 
19742   // vXi16 supported only on AVX-512, BWI
19743   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
19744     return false;
19745 
19746   if (VT.is512BitVector() || Subtarget.hasVLX())
19747     return true;
19748 
19749   bool LShift = VT.is128BitVector() || VT.is256BitVector();
19750   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
19751   return (Opcode == ISD::SRA) ? AShift : LShift;
19752 }
19753 
19754 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
19755                                          const X86Subtarget &Subtarget) {
19756   MVT VT = Op.getSimpleValueType();
19757   SDLoc dl(Op);
19758   SDValue R = Op.getOperand(0);
19759   SDValue Amt = Op.getOperand(1);
19760 
19761   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19762     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19763 
19764   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
19765     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
19766     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
19767     SDValue Ex = DAG.getBitcast(ExVT, R);
19768 
19769     if (ShiftAmt >= 32) {
19770       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
19771       SDValue Upper =
19772           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
19773       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19774                                                  ShiftAmt - 32, DAG);
19775       if (VT == MVT::v2i64)
19776         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
19777       if (VT == MVT::v4i64)
19778         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19779                                   {9, 1, 11, 3, 13, 5, 15, 7});
19780     } else {
19781       // SRA upper i32, SHL whole i64 and select lower i32.
19782       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19783                                                  ShiftAmt, DAG);
19784       SDValue Lower =
19785           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
19786       Lower = DAG.getBitcast(ExVT, Lower);
19787       if (VT == MVT::v2i64)
19788         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
19789       if (VT == MVT::v4i64)
19790         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19791                                   {8, 1, 10, 3, 12, 5, 14, 7});
19792     }
19793     return DAG.getBitcast(VT, Ex);
19794   };
19795 
19796   // Optimize shl/srl/sra with constant shift amount.
19797   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
19798     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
19799       uint64_t ShiftAmt = ShiftConst->getZExtValue();
19800 
19801       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19802         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19803 
19804       // i64 SRA needs to be performed as partial shifts.
19805       if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
19806           Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
19807         return ArithmeticShiftRight64(ShiftAmt);
19808 
19809       if (VT == MVT::v16i8 ||
19810           (Subtarget.hasInt256() && VT == MVT::v32i8) ||
19811           VT == MVT::v64i8) {
19812         unsigned NumElts = VT.getVectorNumElements();
19813         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
19814 
19815         // Simple i8 add case
19816         if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
19817           return DAG.getNode(ISD::ADD, dl, VT, R, R);
19818 
19819         // ashr(R, 7)  === cmp_slt(R, 0)
19820         if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
19821           SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
19822           if (VT.is512BitVector()) {
19823             assert(VT == MVT::v64i8 && "Unexpected element type!");
19824             SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
19825             return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
19826           }
19827           return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
19828         }
19829 
19830         // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
19831         if (VT == MVT::v16i8 && Subtarget.hasXOP())
19832           return SDValue();
19833 
19834         if (Op.getOpcode() == ISD::SHL) {
19835           // Make a large shift.
19836           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
19837                                                    R, ShiftAmt, DAG);
19838           SHL = DAG.getBitcast(VT, SHL);
19839           // Zero out the rightmost bits.
19840           return DAG.getNode(ISD::AND, dl, VT, SHL,
19841                              DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
19842         }
19843         if (Op.getOpcode() == ISD::SRL) {
19844           // Make a large shift.
19845           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
19846                                                    R, ShiftAmt, DAG);
19847           SRL = DAG.getBitcast(VT, SRL);
19848           // Zero out the leftmost bits.
19849           return DAG.getNode(ISD::AND, dl, VT, SRL,
19850                              DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
19851         }
19852         if (Op.getOpcode() == ISD::SRA) {
19853           // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
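          // For example (illustrative only): for R = 0xF0 (-16) and Amt = 4,
          // lshr gives 0x0F, Mask = 128 >> 4 = 0x08, the xor gives 0x07 and
          // the sub gives 0xFF = -1 = ashr(-16, 4).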
19854           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
19855 
19856           SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
19857           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
19858           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
19859           return Res;
19860         }
19861         llvm_unreachable("Unknown shift opcode.");
19862       }
19863     }
19864   }
19865 
19866   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19867   if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
19868       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
19869 
19870     // Peek through any splat that was introduced for i64 shift vectorization.
19871     int SplatIndex = -1;
19872     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
19873       if (SVN->isSplat()) {
19874         SplatIndex = SVN->getSplatIndex();
19875         Amt = Amt.getOperand(0);
19876         assert(SplatIndex < (int)VT.getVectorNumElements() &&
19877                "Splat shuffle referencing second operand");
19878       }
19879 
19880     if (Amt.getOpcode() != ISD::BITCAST ||
19881         Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
19882       return SDValue();
19883 
19884     Amt = Amt.getOperand(0);
19885     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19886                      VT.getVectorNumElements();
19887     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
19888     uint64_t ShiftAmt = 0;
19889     unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
19890     for (unsigned i = 0; i != Ratio; ++i) {
19891       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
19892       if (!C)
19893         return SDValue();
19894       // 6 == Log2(64)
19895       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
19896     }
19897 
19898     // Check remaining shift amounts (if not a splat).
19899     if (SplatIndex < 0) {
19900       for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19901         uint64_t ShAmt = 0;
19902         for (unsigned j = 0; j != Ratio; ++j) {
19903           ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
19904           if (!C)
19905             return SDValue();
19906           // 6 == Log2(64)
19907           ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
19908         }
19909         if (ShAmt != ShiftAmt)
19910           return SDValue();
19911       }
19912     }
19913 
19914     if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19915       return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19916 
19917     if (Op.getOpcode() == ISD::SRA)
19918       return ArithmeticShiftRight64(ShiftAmt);
19919   }
19920 
19921   return SDValue();
19922 }
19923 
19924 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
19925                                         const X86Subtarget &Subtarget) {
19926   MVT VT = Op.getSimpleValueType();
19927   SDLoc dl(Op);
19928   SDValue R = Op.getOperand(0);
19929   SDValue Amt = Op.getOperand(1);
19930 
19931   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19932     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19933 
19934   unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
19935     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
19936 
19937   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
19938     SDValue BaseShAmt;
19939     MVT EltVT = VT.getVectorElementType();
19940 
19941     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
19942       // Check if this build_vector node is doing a splat.
19943       // If so, then set BaseShAmt equal to the splat value.
19944       BaseShAmt = BV->getSplatValue();
19945       if (BaseShAmt && BaseShAmt.isUndef())
19946         BaseShAmt = SDValue();
19947     } else {
19948       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
19949         Amt = Amt.getOperand(0);
19950 
19951       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
19952       if (SVN && SVN->isSplat()) {
19953         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
19954         SDValue InVec = Amt.getOperand(0);
19955         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
19956           assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
19957                  "Unexpected shuffle index found!");
19958           BaseShAmt = InVec.getOperand(SplatIdx);
19959         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
19960            if (ConstantSDNode *C =
19961                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
19962              if (C->getZExtValue() == SplatIdx)
19963                BaseShAmt = InVec.getOperand(1);
19964            }
19965         }
19966 
19967         if (!BaseShAmt)
19968           // Avoid introducing an extract element from a shuffle.
19969           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
19970                                   DAG.getIntPtrConstant(SplatIdx, dl));
19971       }
19972     }
19973 
19974     if (BaseShAmt.getNode()) {
19975       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
19976       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
19977         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
19978       else if (EltVT.bitsLT(MVT::i32))
19979         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
19980 
19981       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
19982     }
19983   }
19984 
19985   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19986   if (!Subtarget.is64Bit() && VT == MVT::v2i64  &&
19987       Amt.getOpcode() == ISD::BITCAST &&
19988       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
19989     Amt = Amt.getOperand(0);
19990     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19991                      VT.getVectorNumElements();
19992     std::vector<SDValue> Vals(Ratio);
19993     for (unsigned i = 0; i != Ratio; ++i)
19994       Vals[i] = Amt.getOperand(i);
19995     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19996       for (unsigned j = 0; j != Ratio; ++j)
19997         if (Vals[j] != Amt.getOperand(i + j))
19998           return SDValue();
19999     }
20000 
20001     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
20002       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
20003   }
20004   return SDValue();
20005 }
20006 
20007 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
20008                           SelectionDAG &DAG) {
20009   MVT VT = Op.getSimpleValueType();
20010   SDLoc dl(Op);
20011   SDValue R = Op.getOperand(0);
20012   SDValue Amt = Op.getOperand(1);
20013   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20014 
20015   assert(VT.isVector() && "Custom lowering only for vector shifts!");
20016   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
20017 
20018   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
20019     return V;
20020 
20021   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
20022     return V;
20023 
20024   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
20025     return Op;
20026 
20027   // XOP has 128-bit variable logical/arithmetic shifts.
20028   // +ve/-ve Amt = shift left/right.
20029   if (Subtarget.hasXOP() &&
20030       (VT == MVT::v2i64 || VT == MVT::v4i32 ||
20031        VT == MVT::v8i16 || VT == MVT::v16i8)) {
20032     if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
20033       SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20034       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
20035     }
20036     if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
20037       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
20038     if (Op.getOpcode() == ISD::SRA)
20039       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
20040   }
20041 
20042   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
20043   // shifts per-lane and then shuffle the partial results back together.
20044   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
20045     // Splat the shift amounts so the scalar shifts above will catch it.
20046     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
20047     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
20048     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
20049     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
20050     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
20051   }
20052 
20053   // i64 vector arithmetic shift can be emulated with the transform:
20054   // M = lshr(SIGN_BIT, Amt)
20055   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
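  // For example (illustrative only): for R = -16 and Amt = 4,
  // lshr(R, 4) = 0x0FFFFFFFFFFFFFFF and M = 0x0800000000000000; the xor
  // gives 0x07FFFFFFFFFFFFFF and the sub gives -1 = ashr(-16, 4).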
20056   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
20057       Op.getOpcode() == ISD::SRA) {
20058     SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
20059     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
20060     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
20061     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
20062     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
20063     return R;
20064   }
20065 
20066   // If possible, lower this packed shift into a vector multiply instead of
20067   // expanding it into a sequence of scalar shifts.
20068   // Do this only if the vector shift count is a constant build_vector.
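  // A constant left shift is equivalent to a per-lane multiply by a power of
  // two, e.g. shl X, <4, 5, 6, 7> becomes mul X, <16, 32, 64, 128>; shift
  // amounts >= the element width become undef lanes below.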
20069   if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
20070       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
20071        (Subtarget.hasInt256() && VT == MVT::v16i16))) {
20072     SmallVector<SDValue, 8> Elts;
20073     MVT SVT = VT.getVectorElementType();
20074     unsigned SVTBits = SVT.getSizeInBits();
20075     APInt One(SVTBits, 1);
20076     unsigned NumElems = VT.getVectorNumElements();
20077 
20078     for (unsigned i=0; i !=NumElems; ++i) {
20079       SDValue Op = Amt->getOperand(i);
20080       if (Op->isUndef()) {
20081         Elts.push_back(Op);
20082         continue;
20083       }
20084 
20085       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
20086       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
20087       uint64_t ShAmt = C.getZExtValue();
20088       if (ShAmt >= SVTBits) {
20089         Elts.push_back(DAG.getUNDEF(SVT));
20090         continue;
20091       }
20092       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
20093     }
20094     SDValue BV = DAG.getBuildVector(VT, dl, Elts);
20095     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
20096   }
20097 
20098   // Lower SHL with variable shift amount.
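  // The trick below: (Amt << 23) + 0x3f800000 is the IEEE-754 single-precision
  // bit pattern of 2^Amt for 0 <= Amt <= 31, so converting it back to an
  // integer yields 1 << Amt per lane, which is then multiplied into R.
  // For example (illustrative only): Amt = 5 gives 0x42000000 = 32.0f, and
  // FP_TO_SINT turns that into 32 = 1 << 5.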
20099   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
20100     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
20101 
20102     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
20103                      DAG.getConstant(0x3f800000U, dl, VT));
20104     Op = DAG.getBitcast(MVT::v4f32, Op);
20105     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
20106     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
20107   }
20108 
20109   // If possible, lower this shift as a sequence of two shifts by
20110   // constant plus a MOVSS/MOVSD instead of scalarizing it.
20111   // Example:
20112   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
20113   //
20114   // Could be rewritten as:
20115   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
20116   //
20117   // The advantage is that the two shifts from the example would be
20118   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
20119   // the vector shift into four scalar shifts plus four pairs of vector
20120   // insert/extract.
20121   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
20122     unsigned TargetOpcode = X86ISD::MOVSS;
20123     bool CanBeSimplified;
20124     // The splat value for the first packed shift (the 'X' from the example).
20125     SDValue Amt1 = Amt->getOperand(0);
20126     // The splat value for the second packed shift (the 'Y' from the example).
20127     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
20128 
20129     // See if it is possible to replace this node with a sequence of
20130     // two shifts followed by a MOVSS/MOVSD
20131     if (VT == MVT::v4i32) {
20132       // Check if it is legal to use a MOVSS.
20133       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
20134                         Amt2 == Amt->getOperand(3);
20135       if (!CanBeSimplified) {
20136         // Otherwise, check if we can still simplify this node using a MOVSD.
20137         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
20138                           Amt->getOperand(2) == Amt->getOperand(3);
20139         TargetOpcode = X86ISD::MOVSD;
20140         Amt2 = Amt->getOperand(2);
20141       }
20142     } else {
20143       // Do similar checks for the case where the machine value type
20144       // is MVT::v8i16.
20145       CanBeSimplified = Amt1 == Amt->getOperand(1);
20146       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
20147         CanBeSimplified = Amt2 == Amt->getOperand(i);
20148 
20149       if (!CanBeSimplified) {
20150         TargetOpcode = X86ISD::MOVSD;
20151         CanBeSimplified = true;
20152         Amt2 = Amt->getOperand(4);
20153         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
20154           CanBeSimplified = Amt1 == Amt->getOperand(i);
20155         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
20156           CanBeSimplified = Amt2 == Amt->getOperand(j);
20157       }
20158     }
20159 
20160     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
20161         isa<ConstantSDNode>(Amt2)) {
20162       // Replace this node with two shifts followed by a MOVSS/MOVSD.
20163       MVT CastVT = MVT::v4i32;
20164       SDValue Splat1 =
20165         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
20166       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
20167       SDValue Splat2 =
20168         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
20169       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
20170       if (TargetOpcode == X86ISD::MOVSD)
20171         CastVT = MVT::v2i64;
20172       SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
20173       SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
20174       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
20175                                             BitCast1, DAG);
20176       return DAG.getBitcast(VT, Result);
20177     }
20178   }
20179 
20180   // v4i32 non-uniform shifts.
20181   // If the shift amount is constant we can shift each lane using the SSE2
20182   // immediate shifts, else we need to zero-extend each lane to the lower i64
20183   // and shift using the SSE2 variable shifts.
20184   // The separate results can then be blended together.
20185   if (VT == MVT::v4i32) {
20186     unsigned Opc = Op.getOpcode();
20187     SDValue Amt0, Amt1, Amt2, Amt3;
20188     if (ConstantAmt) {
20189       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
20190       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
20191       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
20192       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
20193     } else {
20194       // ISD::SHL is handled above but we include it here for completeness.
20195       switch (Opc) {
20196       default:
20197         llvm_unreachable("Unknown target vector shift node");
20198       case ISD::SHL:
20199         Opc = X86ISD::VSHL;
20200         break;
20201       case ISD::SRL:
20202         Opc = X86ISD::VSRL;
20203         break;
20204       case ISD::SRA:
20205         Opc = X86ISD::VSRA;
20206         break;
20207       }
20208       // The SSE2 shifts use the lower i64 as the same shift amount for
20209       // all lanes and the upper i64 is ignored. These shuffle masks
20210       // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
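            // For example, {0, 4, -1, -1} places lane 0 of Amt in the low i32 and
            // a zero from Z above it, so the low i64 holds the zero-extended shift
            // amount that VSHL/VSRL/VSRA read.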
20211       SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20212       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
20213       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
20214       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
20215       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
20216     }
20217 
20218     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
20219     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
20220     SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
20221     SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
20222     SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
20223     SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
20224     return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
20225   }
20226 
20227   if (VT == MVT::v16i8 ||
20228       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
20229     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
20230     unsigned ShiftOpcode = Op->getOpcode();
20231 
20232     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
20233       // On SSE41 targets we make use of the fact that VSELECT lowers
20234       // to PBLENDVB which selects bytes based just on the sign bit.
20235       if (Subtarget.hasSSE41()) {
20236         V0 = DAG.getBitcast(VT, V0);
20237         V1 = DAG.getBitcast(VT, V1);
20238         Sel = DAG.getBitcast(VT, Sel);
20239         return DAG.getBitcast(SelVT,
20240                               DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
20241       }
20242       // On pre-SSE41 targets we test for the sign bit by comparing to
20243       // zero - a negative value will set all bits of the lanes to true
20244       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
20245       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
20246       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
20247       return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
20248     };
20249 
20250     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
20251     // We can safely do this using i16 shifts as we're only interested in
20252     // the 3 lower bits of each byte.
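          // After the shift by 5, amount bit 2 sits in the sign bit of each byte;
          // each subsequent 'a += a' moves the next lower amount bit into the sign
          // position, selecting the shift-by-4/2/1 steps below in turn.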
20253     Amt = DAG.getBitcast(ExtVT, Amt);
20254     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
20255     Amt = DAG.getBitcast(VT, Amt);
20256 
20257     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
20258       // r = VSELECT(r, shift(r, 4), a);
20259       SDValue M =
20260           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20261       R = SignBitSelect(VT, Amt, M, R);
20262 
20263       // a += a
20264       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20265 
20266       // r = VSELECT(r, shift(r, 2), a);
20267       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20268       R = SignBitSelect(VT, Amt, M, R);
20269 
20270       // a += a
20271       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20272 
20273       // return VSELECT(r, shift(r, 1), a);
20274       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20275       R = SignBitSelect(VT, Amt, M, R);
20276       return R;
20277     }
20278 
20279     if (Op->getOpcode() == ISD::SRA) {
20280       // For SRA we need to unpack each byte to the higher byte of a i16 vector
20281       // so we can correctly sign extend. We don't care what happens to the
20282       // lower byte.
20283       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
20284       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
20285       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
20286       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
20287       ALo = DAG.getBitcast(ExtVT, ALo);
20288       AHi = DAG.getBitcast(ExtVT, AHi);
20289       RLo = DAG.getBitcast(ExtVT, RLo);
20290       RHi = DAG.getBitcast(ExtVT, RHi);
20291 
20292       // r = VSELECT(r, shift(r, 4), a);
20293       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20294                                 DAG.getConstant(4, dl, ExtVT));
20295       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20296                                 DAG.getConstant(4, dl, ExtVT));
20297       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20298       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20299 
20300       // a += a
20301       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20302       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20303 
20304       // r = VSELECT(r, shift(r, 2), a);
20305       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20306                         DAG.getConstant(2, dl, ExtVT));
20307       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20308                         DAG.getConstant(2, dl, ExtVT));
20309       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20310       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20311 
20312       // a += a
20313       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20314       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20315 
20316       // r = VSELECT(r, shift(r, 1), a);
20317       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20318                         DAG.getConstant(1, dl, ExtVT));
20319       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20320                         DAG.getConstant(1, dl, ExtVT));
20321       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20322       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20323 
20324       // Logical shift the result back to the lower byte, leaving a zero
20325       // upper byte, meaning that we can safely pack the result with
20326       // PACKUSWB.
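            // For example, for the byte 0x80 shifted right by 1: the i16 lane holds
            // 0x80xx, SRA by 1 makes the top byte 0xC0, and the final SRL by 8
            // leaves 0x00C0, i.e. -64 in the low byte with a zero upper byte.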
20327       RLo =
20328           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
20329       RHi =
20330           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
20331       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20332     }
20333   }
20334 
20335   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
20336   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
20337   // solution better.
20338   if (Subtarget.hasInt256() && VT == MVT::v8i16) {
20339     MVT ExtVT = MVT::v8i32;
20340     unsigned ExtOpc =
20341         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20342     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
20343     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
20344     return DAG.getNode(ISD::TRUNCATE, dl, VT,
20345                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
20346   }
20347 
20348   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
20349     MVT ExtVT = MVT::v8i32;
20350     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20351     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
20352     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
20353     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
20354     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
20355     ALo = DAG.getBitcast(ExtVT, ALo);
20356     AHi = DAG.getBitcast(ExtVT, AHi);
20357     RLo = DAG.getBitcast(ExtVT, RLo);
20358     RHi = DAG.getBitcast(ExtVT, RHi);
20359     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
20360     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
20361     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
20362     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
20363     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20364   }
20365 
20366   if (VT == MVT::v8i16) {
20367     unsigned ShiftOpcode = Op->getOpcode();
20368 
20369     // If we have a constant shift amount, the non-SSE41 path is best, as
20370     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
20371     bool UseSSE41 = Subtarget.hasSSE41() &&
20372                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20373 
20374     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
20375       // On SSE41 targets we make use of the fact that VSELECT lowers
20376       // to PBLENDVB which selects bytes based just on the sign bit.
20377       if (UseSSE41) {
20378         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
20379         V0 = DAG.getBitcast(ExtVT, V0);
20380         V1 = DAG.getBitcast(ExtVT, V1);
20381         Sel = DAG.getBitcast(ExtVT, Sel);
20382         return DAG.getBitcast(
20383             VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
20384       }
20385       // On pre-SSE41 targets we splat the sign bit - a negative value will
20386       // set all bits of the lanes to true and VSELECT uses that in
20387       // its OR(AND(V0,C),AND(V1,~C)) lowering.
20388       SDValue C =
20389           DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
20390       return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
20391     };
20392 
20393     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
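          // Only bits 3:0 of each i16 amount matter; the shift by 12 (plus the
          // extra shift by 4 on SSE41 so both bytes carry the mask) puts amount
          // bit 3 in the sign bit, and each 'a += a' below exposes the next bit.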
20394     if (UseSSE41) {
20395       // On SSE41 targets we need to replicate the shift mask in both
20396       // bytes for PBLENDVB.
20397       Amt = DAG.getNode(
20398           ISD::OR, dl, VT,
20399           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
20400           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
20401     } else {
20402       Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
20403     }
20404 
20405     // r = VSELECT(r, shift(r, 8), a);
20406     SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
20407     R = SignBitSelect(Amt, M, R);
20408 
20409     // a += a
20410     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20411 
20412     // r = VSELECT(r, shift(r, 4), a);
20413     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20414     R = SignBitSelect(Amt, M, R);
20415 
20416     // a += a
20417     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20418 
20419     // r = VSELECT(r, shift(r, 2), a);
20420     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20421     R = SignBitSelect(Amt, M, R);
20422 
20423     // a += a
20424     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20425 
20426     // return VSELECT(r, shift(r, 1), a);
20427     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20428     R = SignBitSelect(Amt, M, R);
20429     return R;
20430   }
20431 
20432   // Decompose 256-bit shifts into smaller 128-bit shifts.
20433   if (VT.is256BitVector())
20434     return Lower256IntArith(Op, DAG);
20435 
20436   return SDValue();
20437 }
20438 
20439 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
20440                            SelectionDAG &DAG) {
20441   MVT VT = Op.getSimpleValueType();
20442   SDLoc DL(Op);
20443   SDValue R = Op.getOperand(0);
20444   SDValue Amt = Op.getOperand(1);
20445 
20446   assert(VT.isVector() && "Custom lowering only for vector rotates!");
20447   assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
20448   assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
20449 
20450   // XOP has 128-bit vector variable + immediate rotates.
20451   // +ve/-ve Amt = rotate left/right.
20452 
20453   // Split 256-bit integers.
20454   if (VT.is256BitVector())
20455     return Lower256IntArith(Op, DAG);
20456 
20457   assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
20458 
20459   // Attempt to rotate by immediate.
20460   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
20461     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
20462       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
20463       assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
20464       return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
20465                          DAG.getConstant(RotateAmt, DL, MVT::i8));
20466     }
20467   }
20468 
20469   // Use general rotate by variable (per-element).
20470   return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
20471 }
20472 
20473 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
20474   // Lower the "add/sub/mul with overflow" instruction into a regular binary
20475   // op plus a "setcc" that checks the overflow flag. The "brcond" lowering
20476   // looks for this combo and may remove the "setcc" instruction if the "setcc"
20477   // has only one use.
20478   SDNode *N = Op.getNode();
20479   SDValue LHS = N->getOperand(0);
20480   SDValue RHS = N->getOperand(1);
20481   unsigned BaseOp = 0;
20482   unsigned Cond = 0;
20483   SDLoc DL(Op);
20484   switch (Op.getOpcode()) {
20485   default: llvm_unreachable("Unknown ovf instruction!");
20486   case ISD::SADDO:
20487     // An add of one will be selected as an INC. Note that INC doesn't
20488     // set CF, so we can't do this for UADDO.
20489     if (isOneConstant(RHS)) {
20490       BaseOp = X86ISD::INC;
20491       Cond = X86::COND_O;
20492       break;
20493     }
20494     BaseOp = X86ISD::ADD;
20495     Cond = X86::COND_O;
20496     break;
20497   case ISD::UADDO:
20498     BaseOp = X86ISD::ADD;
20499     Cond = X86::COND_B;
20500     break;
20501   case ISD::SSUBO:
20502     // A subtract of one will be selected as a DEC. Note that DEC doesn't
20503     // set CF, so we can't do this for USUBO.
20504     if (isOneConstant(RHS)) {
20505       BaseOp = X86ISD::DEC;
20506       Cond = X86::COND_O;
20507       break;
20508     }
20509     BaseOp = X86ISD::SUB;
20510     Cond = X86::COND_O;
20511     break;
20512   case ISD::USUBO:
20513     BaseOp = X86ISD::SUB;
20514     Cond = X86::COND_B;
20515     break;
20516   case ISD::SMULO:
20517     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
20518     Cond = X86::COND_O;
20519     break;
20520   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
20521     if (N->getValueType(0) == MVT::i8) {
20522       BaseOp = X86ISD::UMUL8;
20523       Cond = X86::COND_O;
20524       break;
20525     }
20526     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
20527                                  MVT::i32);
20528     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
20529 
20530     SDValue SetCC =
20531       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20532                   DAG.getConstant(X86::COND_O, DL, MVT::i32),
20533                   SDValue(Sum.getNode(), 2));
20534 
20535     if (N->getValueType(1) == MVT::i1) {
20536       SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20537                           DAG.getValueType(MVT::i1));
20538       SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20539     }
20540     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20541   }
20542   }
20543 
20544   // Also sets EFLAGS.
20545   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
20546   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
20547 
20548   SDValue SetCC =
20549     DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20550                 DAG.getConstant(Cond, DL, MVT::i32),
20551                 SDValue(Sum.getNode(), 1));
20552 
20553   if (N->getValueType(1) == MVT::i1) {
20554     SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20555                         DAG.getValueType(MVT::i1));
20556     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20557   }
20558   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20559 }
20560 
20561 /// Returns true if the operand type is exactly twice the native width, and
20562 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
20563 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
20564 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
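      /// For example, an i64 atomic on 32-bit x86 needs cmpxchg8b, and an i128
      /// atomic on x86-64 needs cmpxchg16b.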
20565 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
20566   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
20567 
20568   if (OpWidth == 64)
20569     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
20570   else if (OpWidth == 128)
20571     return Subtarget.hasCmpxchg16b();
20572   else
20573     return false;
20574 }
20575 
20576 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20577   return needsCmpXchgNb(SI->getValueOperand()->getType());
20578 }
20579 
20580 // Note: this turns large loads into lock cmpxchg8b/16b.
20581 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
20582 TargetLowering::AtomicExpansionKind
20583 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20584   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
20585   return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
20586                                                : AtomicExpansionKind::None;
20587 }
20588 
20589 TargetLowering::AtomicExpansionKind
20590 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
20591   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20592   Type *MemType = AI->getType();
20593 
20594   // If the operand is too big, we must see if cmpxchg8/16b is available
20595   // and default to library calls otherwise.
20596   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
20597     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
20598                                    : AtomicExpansionKind::None;
20599   }
20600 
20601   AtomicRMWInst::BinOp Op = AI->getOperation();
20602   switch (Op) {
20603   default:
20604     llvm_unreachable("Unknown atomic operation");
20605   case AtomicRMWInst::Xchg:
20606   case AtomicRMWInst::Add:
20607   case AtomicRMWInst::Sub:
20608     // It's better to use xadd, xsub or xchg for these in all cases.
20609     return AtomicExpansionKind::None;
20610   case AtomicRMWInst::Or:
20611   case AtomicRMWInst::And:
20612   case AtomicRMWInst::Xor:
20613     // If the atomicrmw's result isn't actually used, we can just add a "lock"
20614     // prefix to a normal instruction for these operations.
20615     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
20616                             : AtomicExpansionKind::None;
20617   case AtomicRMWInst::Nand:
20618   case AtomicRMWInst::Max:
20619   case AtomicRMWInst::Min:
20620   case AtomicRMWInst::UMax:
20621   case AtomicRMWInst::UMin:
20622     // These always require a non-trivial set of data operations on x86. We must
20623     // use a cmpxchg loop.
20624     return AtomicExpansionKind::CmpXChg;
20625   }
20626 }
20627 
20628 LoadInst *
20629 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20630   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20631   Type *MemType = AI->getType();
20632   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
20633   // there is no benefit in turning such RMWs into loads, and it is actually
20634   // harmful, as it introduces an mfence.
20635   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
20636     return nullptr;
20637 
20638   auto Builder = IRBuilder<>(AI);
20639   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20640   auto SynchScope = AI->getSynchScope();
20641   // We must restrict the ordering to avoid generating loads with Release or
20642   // ReleaseAcquire orderings.
20643   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
20644   auto Ptr = AI->getPointerOperand();
20645 
20646   // Before the load we need a fence. Here is an example lifted from
20647   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
20648   // is required:
20649   // Thread 0:
20650   //   x.store(1, relaxed);
20651   //   r1 = y.fetch_add(0, release);
20652   // Thread 1:
20653   //   y.fetch_add(42, acquire);
20654   //   r2 = x.load(relaxed);
20655   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
20656   // lowered to just a load without a fence. An mfence flushes the store buffer,
20657   // making the optimization clearly correct.
20658   // FIXME: The fence is required if isReleaseOrStronger(Order), but it is not
20659   // clear whether it is needed otherwise; we might be able to be more
20660   // aggressive on relaxed idempotent RMWs. In practice, they do not look
20661   // useful, so we don't try to be especially clever.
20662   if (SynchScope == SingleThread)
20663     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
20664     // the IR level, so we must wrap it in an intrinsic.
20665     return nullptr;
20666 
20667   if (!Subtarget.hasMFence())
20668     // FIXME: it might make sense to use a locked operation here but on a
20669     // different cache-line to prevent cache-line bouncing. In practice it
20670     // is probably a small win, and x86 processors without mfence are rare
20671     // enough that we do not bother.
20672     return nullptr;
20673 
20674   Function *MFence =
20675       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
20676   Builder.CreateCall(MFence, {});
20677 
20678   // Finally we can emit the atomic load.
20679   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
20680           AI->getType()->getPrimitiveSizeInBits());
20681   Loaded->setAtomic(Order, SynchScope);
20682   AI->replaceAllUsesWith(Loaded);
20683   AI->eraseFromParent();
20684   return Loaded;
20685 }
20686 
20687 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
20688                                  SelectionDAG &DAG) {
20689   SDLoc dl(Op);
20690   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
20691     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
20692   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
20693     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
20694 
20695   // The only fence that needs an instruction is a sequentially-consistent
20696   // cross-thread fence.
20697   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
20698       FenceScope == CrossThread) {
20699     if (Subtarget.hasMFence())
20700       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
20701 
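          // Without MFENCE, a LOCK-prefixed OR of zero into the top of the stack
          // acts as a full memory barrier.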
20702     SDValue Chain = Op.getOperand(0);
20703     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
20704     SDValue Ops[] = {
20705       DAG.getRegister(X86::ESP, MVT::i32),     // Base
20706       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
20707       DAG.getRegister(0, MVT::i32),            // Index
20708       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
20709       DAG.getRegister(0, MVT::i32),            // Segment.
20710       Zero,
20711       Chain
20712     };
20713     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
20714     return SDValue(Res, 0);
20715   }
20716 
20717   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
20718   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
20719 }
20720 
20721 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
20722                              SelectionDAG &DAG) {
20723   MVT T = Op.getSimpleValueType();
20724   SDLoc DL(Op);
20725   unsigned Reg = 0;
20726   unsigned size = 0;
20727   switch(T.SimpleTy) {
20728   default: llvm_unreachable("Invalid value type!");
20729   case MVT::i8:  Reg = X86::AL;  size = 1; break;
20730   case MVT::i16: Reg = X86::AX;  size = 2; break;
20731   case MVT::i32: Reg = X86::EAX; size = 4; break;
20732   case MVT::i64:
20733     assert(Subtarget.is64Bit() && "Node not type legal!");
20734     Reg = X86::RAX; size = 8;
20735     break;
20736   }
20737   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
20738                                   Op.getOperand(2), SDValue());
20739   SDValue Ops[] = { cpIn.getValue(0),
20740                     Op.getOperand(1),
20741                     Op.getOperand(3),
20742                     DAG.getTargetConstant(size, DL, MVT::i8),
20743                     cpIn.getValue(1) };
20744   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20745   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
20746   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
20747                                            Ops, T, MMO);
20748 
20749   SDValue cpOut =
20750     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
20751   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
20752                                       MVT::i32, cpOut.getValue(2));
20753   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
20754                                 DAG.getConstant(X86::COND_E, DL, MVT::i8),
20755                                 EFLAGS);
20756 
20757   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
20758   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
20759   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
20760   return SDValue();
20761 }
20762 
20763 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
20764                             SelectionDAG &DAG) {
20765   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
20766   MVT DstVT = Op.getSimpleValueType();
20767 
20768   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
20769       SrcVT == MVT::i64) {
20770     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
20771     if (DstVT != MVT::f64)
20772       // This conversion needs to be expanded.
20773       return SDValue();
20774 
20775     SDValue Op0 = Op->getOperand(0);
20776     SmallVector<SDValue, 16> Elts;
20777     SDLoc dl(Op);
20778     unsigned NumElts;
20779     MVT SVT;
20780     if (SrcVT.isVector()) {
20781       NumElts = SrcVT.getVectorNumElements();
20782       SVT = SrcVT.getVectorElementType();
20783 
20784       // Widen the input vector in the case of MVT::v2i32.
20785       // Example: from MVT::v2i32 to MVT::v4i32.
20786       for (unsigned i = 0, e = NumElts; i != e; ++i)
20787         Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
20788                                    DAG.getIntPtrConstant(i, dl)));
20789     } else {
20790       assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
20791              "Unexpected source type in LowerBITCAST");
20792       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20793                                  DAG.getIntPtrConstant(0, dl)));
20794       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20795                                  DAG.getIntPtrConstant(1, dl)));
20796       NumElts = 2;
20797       SVT = MVT::i32;
20798     }
20799     // Explicitly mark the extra elements as Undef.
20800     Elts.append(NumElts, DAG.getUNDEF(SVT));
20801 
20802     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20803     SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
20804     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
20805     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
20806                        DAG.getIntPtrConstant(0, dl));
20807   }
20808 
20809   assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
20810          Subtarget.hasMMX() && "Unexpected custom BITCAST");
20811   assert((DstVT == MVT::i64 ||
20812           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
20813          "Unexpected custom BITCAST");
20814   // i64 <=> MMX conversions are Legal.
20815   if (SrcVT==MVT::i64 && DstVT.isVector())
20816     return Op;
20817   if (DstVT==MVT::i64 && SrcVT.isVector())
20818     return Op;
20819   // MMX <=> MMX conversions are Legal.
20820   if (SrcVT.isVector() && DstVT.isVector())
20821     return Op;
20822   // All other conversions need to be expanded.
20823   return SDValue();
20824 }
20825 
20826 /// Compute the horizontal sum of bytes in V for the elements of VT.
20827 ///
20828 /// Requires V to be a byte vector and VT to be an integer vector type with
20829 /// wider elements than V's type. The width of the elements of VT determines
20830 /// how many bytes of V are summed horizontally to produce each element of the
20831 /// result.
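      /// For example, with VT == v8i16 each result element is the sum of the two
      /// corresponding bytes of V; with VT == v4i32, of the four corresponding bytes.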
20832 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
20833                                       const X86Subtarget &Subtarget,
20834                                       SelectionDAG &DAG) {
20835   SDLoc DL(V);
20836   MVT ByteVecVT = V.getSimpleValueType();
20837   MVT EltVT = VT.getVectorElementType();
20838   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
20839          "Expected value to have byte element type.");
20840   assert(EltVT != MVT::i8 &&
20841          "Horizontal byte sum only makes sense for wider elements!");
20842   unsigned VecSize = VT.getSizeInBits();
20843   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
20844 
20845   // The PSADBW instruction horizontally adds all bytes and leaves the result
20846   // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
20847   if (EltVT == MVT::i64) {
20848     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20849     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20850     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
20851     return DAG.getBitcast(VT, V);
20852   }
20853 
20854   if (EltVT == MVT::i32) {
20855     // We unpack the low half and high half into i32s interleaved with zeros so
20856     // that we can use PSADBW to horizontally sum them. The most useful part of
20857     // this is that it lines up the results of two PSADBW instructions to be
20858     // two v2i64 vectors which concatenated are the 4 population counts. We can
20859     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
20860     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
20861     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
20862     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
20863 
20864     // Do the horizontal sums into two v2i64s.
20865     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20866     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20867     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20868                       DAG.getBitcast(ByteVecVT, Low), Zeros);
20869     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20870                        DAG.getBitcast(ByteVecVT, High), Zeros);
20871 
20872     // Merge them together.
20873     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
20874     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
20875                     DAG.getBitcast(ShortVecVT, Low),
20876                     DAG.getBitcast(ShortVecVT, High));
20877 
20878     return DAG.getBitcast(VT, V);
20879   }
20880 
20881   // The only element type left is i16.
20882   assert(EltVT == MVT::i16 && "Unknown how to handle type");
20883 
20884   // To obtain pop count for each i16 element starting from the pop count for
20885   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
20886   // right by 8. It is important to shift as i16s because an i8 vector shift
20887   // isn't directly supported.
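        // For example, if an i16 lane holds byte counts 3 (low) and 2 (high), i.e.
        // 0x0203: shl 8 gives 0x0300, the byte-wise add gives 0x0503, and srl 8
        // gives 0x0005 = 3 + 2.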
20888   SDValue ShifterV = DAG.getConstant(8, DL, VT);
20889   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20890   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
20891                   DAG.getBitcast(ByteVecVT, V));
20892   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20893 }
20894 
20895 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
20896                                         const X86Subtarget &Subtarget,
20897                                         SelectionDAG &DAG) {
20898   MVT VT = Op.getSimpleValueType();
20899   MVT EltVT = VT.getVectorElementType();
20900   unsigned VecSize = VT.getSizeInBits();
20901 
20902   // Implement a lookup table in register by using an algorithm based on:
20903   // http://wm.ite.pl/articles/sse-popcount.html
20904   //
20905   // The general idea is that each nibble of every byte in the input vector is
20906   // an index into an in-register pre-computed pop count table. We then split
20907   // the input vector into two new ones: (1) a vector with only the shifted-right
20908   // higher nibbles of each byte and (2) a vector with the lower nibbles (with
20909   // the higher ones masked out) of each byte. PSHUFB is used separately with
20910   // both to index the in-register table. Next, both are added and the result
20911   // is an i8 vector where each element contains the pop count of its byte.
20912   //
20913   // To obtain the pop count for elements != i8, we follow up with the same
20914   // approach and use additional tricks as described below.
20915   //
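        // For example, for the input byte 0xB3 (0b10110011): LUT[0xB] = 3 for the
        // high nibble, LUT[0x3] = 2 for the low nibble, and 3 + 2 = 5 set bits.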
20916   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
20917                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
20918                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
20919                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
20920 
20921   int NumByteElts = VecSize / 8;
20922   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
20923   SDValue In = DAG.getBitcast(ByteVecVT, Op);
20924   SmallVector<SDValue, 64> LUTVec;
20925   for (int i = 0; i < NumByteElts; ++i)
20926     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20927   SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
20928   SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
20929 
20930   // High nibbles
20931   SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
20932   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
20933 
20934   // Low nibbles
20935   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
20936 
20937   // The input vector is used as the shuffle mask that index elements into the
20938   // LUT. After counting low and high nibbles, add the vector to obtain the
20939   // final pop count per i8 element.
20940   SDValue HighPopCnt =
20941       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
20942   SDValue LowPopCnt =
20943       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
20944   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
20945 
20946   if (EltVT == MVT::i8)
20947     return PopCnt;
20948 
20949   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
20950 }
20951 
20952 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
20953                                        const X86Subtarget &Subtarget,
20954                                        SelectionDAG &DAG) {
20955   MVT VT = Op.getSimpleValueType();
20956   assert(VT.is128BitVector() &&
20957          "Only 128-bit vector bitmath lowering supported.");
20958 
20959   int VecSize = VT.getSizeInBits();
20960   MVT EltVT = VT.getVectorElementType();
20961   int Len = EltVT.getSizeInBits();
20962 
20963   // This is the vectorized version of the "best" algorithm from
20964   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
20965   // with a minor tweak to use a series of adds + shifts instead of vector
20966   // multiplications. Implemented for all integer vector types. We only use
20967   // this when we don't have SSSE3 which allows a LUT-based lowering that is
20968   // much faster, even faster than using native popcnt instructions.
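        // For example, for a byte lane holding 0xFF: step 1 gives 0xAA (two bits
        // per 2-bit field), step 2 gives 0x44 (four bits per nibble), and step 3
        // gives 0x08, the pop count of 0xFF.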
20969 
20970   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
20971     MVT VT = V.getSimpleValueType();
20972     SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
20973     return DAG.getNode(OpCode, DL, VT, V, ShifterV);
20974   };
20975   auto GetMask = [&](SDValue V, APInt Mask) {
20976     MVT VT = V.getSimpleValueType();
20977     SDValue MaskV = DAG.getConstant(Mask, DL, VT);
20978     return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
20979   };
20980 
20981   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
20982   // x86, so set the SRL type to have elements at least i16 wide. This is
20983   // correct because all of our SRLs are followed immediately by a mask anyway
20984   // that handles any bits that sneak into the high bits of the byte elements.
20985   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
20986 
20987   SDValue V = Op;
20988 
20989   // v = v - ((v >> 1) & 0x55555555...)
20990   SDValue Srl =
20991       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
20992   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
20993   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
20994 
20995   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
20996   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
20997   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
20998   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
20999   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
21000 
21001   // v = (v + (v >> 4)) & 0x0F0F0F0F...
21002   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
21003   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
21004   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
21005 
21006   // At this point, V contains the byte-wise population count, and we are
21007   // merely doing a horizontal sum if necessary to get the wider element
21008   // counts.
21009   if (EltVT == MVT::i8)
21010     return V;
21011 
21012   return LowerHorizontalByteSum(
21013       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
21014       DAG);
21015 }
21016 
21017 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21018                                 SelectionDAG &DAG) {
21019   MVT VT = Op.getSimpleValueType();
21020   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
21021          "Unknown CTPOP type to handle");
21022   SDLoc DL(Op.getNode());
21023   SDValue Op0 = Op.getOperand(0);
21024 
21025   if (!Subtarget.hasSSSE3()) {
21026     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
21027     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
21028     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
21029   }
21030 
21031   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21032     unsigned NumElems = VT.getVectorNumElements();
21033 
21034     // Extract each 128-bit vector, compute pop count and concat the result.
21035     SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
21036     SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
21037 
21038     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21039                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21040                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21041   }
21042 
21043   if (VT.is512BitVector() && !Subtarget.hasBWI()) {
21044     unsigned NumElems = VT.getVectorNumElements();
21045 
21046     // Extract each 256-bit vector, compute pop count and concat the result.
21047     SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
21048     SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
21049 
21050     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21051                        LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21052                        LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21053   }
21054 
21055   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
21056 }
21057 
21058 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21059                           SelectionDAG &DAG) {
21060   assert(Op.getSimpleValueType().isVector() &&
21061          "We only do custom lowering for vector population count.");
21062   return LowerVectorCTPOP(Op, Subtarget, DAG);
21063 }
21064 
21065 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
21066   MVT VT = Op.getSimpleValueType();
21067   SDValue In = Op.getOperand(0);
21068   SDLoc DL(Op);
21069 
21070   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
21071   // perform the BITREVERSE.
21072   if (!VT.isVector()) {
21073     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
21074     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
21075     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
21076     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
21077                        DAG.getIntPtrConstant(0, DL));
21078   }
21079 
21080   MVT SVT = VT.getVectorElementType();
21081   int NumElts = VT.getVectorNumElements();
21082   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
21083 
21084   // Decompose 256-bit ops into smaller 128-bit ops.
21085   if (VT.is256BitVector()) {
21086     SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21087     SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21088 
21089     MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
21090     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21091                        DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
21092                        DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
21093   }
21094 
21095   assert(VT.is128BitVector() &&
21096          "Only 128-bit vector bitreverse lowering supported.");
21097 
21098   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
21099   // perform the BSWAP in the shuffle.
21100   // It's best to shuffle using the second operand, as this will implicitly allow
21101   // memory folding for multiple vectors.
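        // Each selector byte uses its low 5 bits to pick one of the 32 source
        // bytes (16..31 come from the second source operand) and its top 3 bits
        // as the operation, where op 2 returns the selected byte bit-reversed.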
21102   SmallVector<SDValue, 16> MaskElts;
21103   for (int i = 0; i != NumElts; ++i) {
21104     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
21105       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
21106       int PermuteByte = SourceByte | (2 << 5);
21107       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
21108     }
21109   }
21110 
21111   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
21112   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
21113   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
21114                     Res, Mask);
21115   return DAG.getBitcast(VT, Res);
21116 }
21117 
21118 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
21119                                SelectionDAG &DAG) {
21120   if (Subtarget.hasXOP())
21121     return LowerBITREVERSE_XOP(Op, DAG);
21122 
21123   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
21124 
21125   MVT VT = Op.getSimpleValueType();
21126   SDValue In = Op.getOperand(0);
21127   SDLoc DL(Op);
21128 
21129   unsigned NumElts = VT.getVectorNumElements();
21130   assert(VT.getScalarType() == MVT::i8 &&
21131          "Only byte vector BITREVERSE supported");
21132 
21133   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
21134   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21135     MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
21136     SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21137     SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21138     Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
21139     Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
21140     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21141   }
21142 
21143   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
21144   // two nibbles and a PSHUFB lookup to find the bitreverse of each
21145   // 0-15 value (moved to the other nibble).
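        // For example, for the input byte 0x1E: LoLUT[0xE] = 0x70, HiLUT[0x1] =
        // 0x08, and 0x70 | 0x08 = 0x78, which is 0x1E with its bits reversed.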
21146   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
21147   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
21148   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
21149 
21150   const int LoLUT[16] = {
21151       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
21152       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
21153       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
21154       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
21155   const int HiLUT[16] = {
21156       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
21157       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
21158       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
21159       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
21160 
21161   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
21162   for (unsigned i = 0; i < NumElts; ++i) {
21163     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
21164     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
21165   }
21166 
21167   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
21168   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
21169   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
21170   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
21171   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
21172 }
21173 
21174 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
21175   unsigned NewOpc = 0;
21176   switch (N->getOpcode()) {
21177   case ISD::ATOMIC_LOAD_ADD:
21178     NewOpc = X86ISD::LADD;
21179     break;
21180   case ISD::ATOMIC_LOAD_SUB:
21181     NewOpc = X86ISD::LSUB;
21182     break;
21183   case ISD::ATOMIC_LOAD_OR:
21184     NewOpc = X86ISD::LOR;
21185     break;
21186   case ISD::ATOMIC_LOAD_XOR:
21187     NewOpc = X86ISD::LXOR;
21188     break;
21189   case ISD::ATOMIC_LOAD_AND:
21190     NewOpc = X86ISD::LAND;
21191     break;
21192   default:
21193     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
21194   }
21195 
21196   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
21197   return DAG.getMemIntrinsicNode(
21198       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
21199       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
21200       /*MemVT=*/N->getSimpleValueType(0), MMO);
21201 }
21202 
21203 /// Lower atomic_load_ops into LOCK-prefixed operations.
21204 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
21205                                 const X86Subtarget &Subtarget) {
21206   SDValue Chain = N->getOperand(0);
21207   SDValue LHS = N->getOperand(1);
21208   SDValue RHS = N->getOperand(2);
21209   unsigned Opc = N->getOpcode();
21210   MVT VT = N->getSimpleValueType(0);
21211   SDLoc DL(N);
21212 
21213   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
21214   // can only be lowered when the result is unused.  They should have already
21215   // been transformed into a cmpxchg loop in AtomicExpand.
21216   if (N->hasAnyUseOfValue(0)) {
21217     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
21218     // select LXADD if LOCK_SUB can't be selected.
21219     if (Opc == ISD::ATOMIC_LOAD_SUB) {
21220       AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
21221       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
21222       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
21223                            RHS, AN->getMemOperand(), AN->getOrdering(),
21224                            AN->getSynchScope());
21225     }
21226     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
21227            "Used AtomicRMW ops other than Add should have been expanded!");
21228     return N;
21229   }
21230 
21231   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
21232   // RAUW the chain, but don't worry about the result, as it's unused.
21233   assert(!N->hasAnyUseOfValue(0));
21234   DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
21235   return SDValue();
21236 }
21237 
21238 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
21239   SDNode *Node = Op.getNode();
21240   SDLoc dl(Node);
21241   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
21242 
21243   // Convert seq_cst store -> xchg
21244   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
21245   // FIXME: On 32-bit, store -> fist or movq would be more efficient
21246   //        (The only way to get a 16-byte store is cmpxchg16b)
21247   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
21248   if (cast<AtomicSDNode>(Node)->getOrdering() ==
21249           AtomicOrdering::SequentiallyConsistent ||
21250       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
21251     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
21252                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
21253                                  Node->getOperand(0),
21254                                  Node->getOperand(1), Node->getOperand(2),
21255                                  cast<AtomicSDNode>(Node)->getMemOperand(),
21256                                  cast<AtomicSDNode>(Node)->getOrdering(),
21257                                  cast<AtomicSDNode>(Node)->getSynchScope());
21258     return Swap.getValue(1);
21259   }
21260   // Other atomic stores have a simple pattern.
21261   return Op;
21262 }
21263 
21264 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
21265   MVT VT = Op.getNode()->getSimpleValueType(0);
21266 
21267   // Let legalize expand this if it isn't a legal type yet.
21268   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
21269     return SDValue();
21270 
21271   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21272 
21273   unsigned Opc;
21274   bool ExtraOp = false;
21275   switch (Op.getOpcode()) {
21276   default: llvm_unreachable("Invalid code");
21277   case ISD::ADDC: Opc = X86ISD::ADD; break;
21278   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
21279   case ISD::SUBC: Opc = X86ISD::SUB; break;
21280   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
21281   }
21282 
21283   if (!ExtraOp)
21284     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21285                        Op.getOperand(1));
21286   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21287                      Op.getOperand(1), Op.getOperand(2));
21288 }
21289 
21290 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
21291                             SelectionDAG &DAG) {
21292   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
21293 
21294   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
21295   // which returns the values as { float, float } (in XMM0) or
21296   // { double, double } (which is returned in XMM0, XMM1).
21297   SDLoc dl(Op);
21298   SDValue Arg = Op.getOperand(0);
21299   EVT ArgVT = Arg.getValueType();
21300   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21301 
21302   TargetLowering::ArgListTy Args;
21303   TargetLowering::ArgListEntry Entry;
21304 
21305   Entry.Node = Arg;
21306   Entry.Ty = ArgTy;
21307   Entry.isSExt = false;
21308   Entry.isZExt = false;
21309   Args.push_back(Entry);
21310 
21311   bool isF64 = ArgVT == MVT::f64;
21312   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
21313   // the small struct {f32, f32} is returned in (eax, edx). For f64,
21314   // the results are returned via SRet in memory.
21315   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
21316   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21317   SDValue Callee =
21318       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
21319 
21320   Type *RetTy = isF64
21321     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
21322     : (Type*)VectorType::get(ArgTy, 4);
21323 
21324   TargetLowering::CallLoweringInfo CLI(DAG);
21325   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
21326     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
21327 
21328   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
21329 
21330   if (isF64)
21331     // Returned in xmm0 and xmm1.
21332     return CallResult.first;
21333 
21334   // Returned in bits 0:31 and 32:63 of xmm0.
21335   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21336                                CallResult.first, DAG.getIntPtrConstant(0, dl));
21337   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21338                                CallResult.first, DAG.getIntPtrConstant(1, dl));
21339   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
21340   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
21341 }
21342 
21343 /// Widen a vector input to a vector of NVT.  The
21344 /// input vector must have the same element type as NVT.
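/// For example, widening a v2i32 value to v4i32 either appends undef (or zero)
/// elements to a build vector of constants, or inserts the v2i32 value into
/// the low half of an undef (or zero-filled) v4i32 with INSERT_SUBVECTOR.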
21345 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
21346                             bool FillWithZeroes = false) {
21347   // Check if InOp already has the right width.
21348   MVT InVT = InOp.getSimpleValueType();
21349   if (InVT == NVT)
21350     return InOp;
21351 
21352   if (InOp.isUndef())
21353     return DAG.getUNDEF(NVT);
21354 
21355   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
21356          "input and widen element type must match");
21357 
21358   unsigned InNumElts = InVT.getVectorNumElements();
21359   unsigned WidenNumElts = NVT.getVectorNumElements();
21360   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
21361          "Unexpected request for vector widening");
21362 
21363   EVT EltVT = NVT.getVectorElementType();
21364 
21365   SDLoc dl(InOp);
21366   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
21367       InOp.getNumOperands() == 2) {
21368     SDValue N1 = InOp.getOperand(1);
21369     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
21370         N1.isUndef()) {
21371       InOp = InOp.getOperand(0);
21372       InVT = InOp.getSimpleValueType();
21373       InNumElts = InVT.getVectorNumElements();
21374     }
21375   }
21376   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
21377       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
21378     SmallVector<SDValue, 16> Ops;
21379     for (unsigned i = 0; i < InNumElts; ++i)
21380       Ops.push_back(InOp.getOperand(i));
21381 
21382     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
21383       DAG.getUNDEF(EltVT);
21384     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
21385       Ops.push_back(FillVal);
21386     return DAG.getBuildVector(NVT, dl, Ops);
21387   }
21388   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
21389     DAG.getUNDEF(NVT);
21390   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
21391                      InOp, DAG.getIntPtrConstant(0, dl));
21392 }
21393 
21394 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
21395                              SelectionDAG &DAG) {
21396   assert(Subtarget.hasAVX512() &&
21397          "MGATHER/MSCATTER are supported on AVX-512 arch only");
21398 
21399   // The X86 scatter instruction kills the mask register, so its type should
21400   // be added to the list of return values.
21401   // If the "scatter" node has 2 return values, it has already been handled.
21402   if (Op.getNode()->getNumValues() == 2)
21403     return Op;
21404 
21405   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
21406   SDValue Src = N->getValue();
21407   MVT VT = Src.getSimpleValueType();
21408   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
21409   SDLoc dl(Op);
21410 
21411   SDValue NewScatter;
21412   SDValue Index = N->getIndex();
21413   SDValue Mask = N->getMask();
21414   SDValue Chain = N->getChain();
21415   SDValue BasePtr = N->getBasePtr();
21416   MVT MemVT = N->getMemoryVT().getSimpleVT();
21417   MVT IndexVT = Index.getSimpleValueType();
21418   MVT MaskVT = Mask.getSimpleValueType();
21419 
21420   if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
21421     // The v2i32 value was promoted to v2i64.
21422     // Now we "redo" the type legalizer's work and widen the original
21423     // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
21424     // with a shuffle.
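    // For example, a <2 x i32> source <a, b> arrives here as the promoted
    // <2 x i64>; bitcast to <4 x i32> it is <a, ?, b, ?> (little-endian), so
    // the {0, 2, -1, -1} shuffle below recovers <a, b> widened to v4i32.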
21425     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
21426            "Unexpected memory type");
21427     int ShuffleMask[] = {0, 2, -1, -1};
21428     Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
21429                                DAG.getUNDEF(MVT::v4i32), ShuffleMask);
21430     // Now we have 4 elements instead of 2.
21431     // Expand the index.
21432     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
21433     Index = ExtendToType(Index, NewIndexVT, DAG);
21434 
21435     // Expand the mask with zeroes
21436     // Mask may be <2 x i64> or <2 x i1> at this moment
21437     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
21438            "Unexpected mask type");
21439     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
21440     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21441     VT = MVT::v4i32;
21442   }
21443 
21444   unsigned NumElts = VT.getVectorNumElements();
21445   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21446       !Index.getSimpleValueType().is512BitVector()) {
21447     // AVX512F supports only 512-bit vectors. Either the data or the index
21448     // must be 512 bits wide. If both the index and data are 256-bit but the
21449     // vector has 8 elements, we just sign-extend the index.
21450     if (IndexVT == MVT::v8i32)
21451       // Just extend index
21452       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21453     else {
21454       // The minimum number of elements in a scatter is 8.
21455       NumElts = 8;
21456       // Index
21457       MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21458       // Use original index here, do not modify the index twice
21459       Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
21460       if (IndexVT.getScalarType() == MVT::i32)
21461         Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21462 
21463       // Mask
21464       // At this point we have promoted mask operand
21465       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21466       MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21467       // Use the original mask here, do not modify the mask twice
21468       Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
21469 
21470       // The value that should be stored
21471       MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21472       Src = ExtendToType(Src, NewVT, DAG);
21473     }
21474   }
21475   // If the mask is "wide" at this point, truncate it to an i1 vector.
21476   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
21477   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
21478 
21479   // The mask is killed by the scatter, so add it to the result values.
21480   SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
21481   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
21482   NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
21483                                     N->getMemOperand());
21484   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
21485   return SDValue(NewScatter.getNode(), 1);
21486 }
21487 
21488 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
21489                           SelectionDAG &DAG) {
21490 
21491   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
21492   MVT VT = Op.getSimpleValueType();
21493   MVT ScalarVT = VT.getScalarType();
21494   SDValue Mask = N->getMask();
21495   SDLoc dl(Op);
21496 
21497   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21498          "Cannot lower masked load op.");
21499 
21500   assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21501           (Subtarget.hasBWI() &&
21502               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21503          "Unsupported masked load op.");
21504 
21505   // This operation is legal for targets with VLX, but without
21506   // VLX the vector should be widened to 512 bits.
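  // For example, a v8i32 masked load is widened to a v16i32 masked load whose
  // upper eight v16i1 mask bits are zero, so the extra lanes are never loaded;
  // the original v8i32 result is recovered by the EXTRACT_SUBVECTOR below.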
21507   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21508   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21509   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21510   SDValue Src0 = N->getSrc0();
21511   Src0 = ExtendToType(Src0, WideDataVT, DAG);
21512   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
21513   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
21514                                       N->getBasePtr(), Mask, Src0,
21515                                       N->getMemoryVT(), N->getMemOperand(),
21516                                       N->getExtensionType());
21517 
21518   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21519                                 NewLoad.getValue(0),
21520                                 DAG.getIntPtrConstant(0, dl));
21521   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
21522   return DAG.getMergeValues(RetOps, dl);
21523 }
21524 
21525 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
21526                            SelectionDAG &DAG) {
21527   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
21528   SDValue DataToStore = N->getValue();
21529   MVT VT = DataToStore.getSimpleValueType();
21530   MVT ScalarVT = VT.getScalarType();
21531   SDValue Mask = N->getMask();
21532   SDLoc dl(Op);
21533 
21534   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21535          "Cannot lower masked store op.");
21536 
21537   assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21538           (Subtarget.hasBWI() &&
21539               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21540           "Unsupported masked store op.");
21541 
21542   // This operation is legal for targets with VLX, but without
21543   // VLX the vector should be widened to 512 bits.
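  // For example, a v8i32 masked store becomes a v16i32 masked store whose
  // upper eight mask bits are zero, so only the original lanes are written.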
21544   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21545   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21546   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21547   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
21548   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
21549   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
21550                             Mask, N->getMemoryVT(), N->getMemOperand(),
21551                             N->isTruncatingStore());
21552 }
21553 
21554 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
21555                             SelectionDAG &DAG) {
21556   assert(Subtarget.hasAVX512() &&
21557          "MGATHER/MSCATTER are supported on AVX-512 arch only");
21558 
21559   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
21560   SDLoc dl(Op);
21561   MVT VT = Op.getSimpleValueType();
21562   SDValue Index = N->getIndex();
21563   SDValue Mask = N->getMask();
21564   SDValue Src0 = N->getValue();
21565   MVT IndexVT = Index.getSimpleValueType();
21566   MVT MaskVT = Mask.getSimpleValueType();
21567 
21568   unsigned NumElts = VT.getVectorNumElements();
21569   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
21570 
21571   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21572       !Index.getSimpleValueType().is512BitVector()) {
21573     // AVX512F supports only 512-bit vectors. Either the data or the index
21574     // must be 512 bits wide. If both the index and data are 256-bit but the
21575     // vector has 8 elements, we just sign-extend the index.
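    // For example, a v8i32 gather with a v8i32 index only needs its index
    // sign-extended to v8i64 so that one operand is 512 bits wide; the node
    // is then updated in place and returned.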
21576     if (NumElts == 8) {
21577       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21578       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
21579                         N->getOperand(3), Index };
21580       DAG.UpdateNodeOperands(N, Ops);
21581       return Op;
21582     }
21583 
21584     // The minimum number of elements in a gather is 8.
21585     NumElts = 8;
21586     // Index
21587     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21588     Index = ExtendToType(Index, NewIndexVT, DAG);
21589     if (IndexVT.getScalarType() == MVT::i32)
21590       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21591 
21592     // Mask
21593     MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
21594     // At this point we have promoted mask operand
21595     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21596     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21597     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21598     Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
21599 
21600     // The pass-thru value
21601     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21602     Src0 = ExtendToType(Src0, NewVT, DAG);
21603 
21604     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
21605     SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
21606                                             N->getMemoryVT(), dl, Ops,
21607                                             N->getMemOperand());
21608     SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21609                                   NewGather.getValue(0),
21610                                   DAG.getIntPtrConstant(0, dl));
21611     SDValue RetOps[] = {Extract, NewGather.getValue(1)};
21612     return DAG.getMergeValues(RetOps, dl);
21613   }
21614   return Op;
21615 }
21616 
21617 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
21618                                                     SelectionDAG &DAG) const {
21619   // TODO: Eventually, the lowering of these nodes should be informed by or
21620   // deferred to the GC strategy for the function in which they appear. For
21621   // now, however, they must be lowered to something. Since they are logically
21622   // no-ops in the case of a null GC strategy (or a GC strategy which does not
21623   // require special handling for these nodes), lower them as literal NOOPs for
21624   // the time being.
21625   SmallVector<SDValue, 2> Ops;
21626 
21627   Ops.push_back(Op.getOperand(0));
21628   if (Op->getGluedNode())
21629     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21630 
21631   SDLoc OpDL(Op);
21632   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21633   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21634 
21635   return NOOP;
21636 }
21637 
21638 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
21639                                                   SelectionDAG &DAG) const {
21640   // TODO: Eventually, the lowering of these nodes should be informed by or
21641   // deferred to the GC strategy for the function in which they appear. For
21642   // now, however, they must be lowered to something. Since they are logically
21643   // no-ops in the case of a null GC strategy (or a GC strategy which does not
21644   // require special handling for these nodes), lower them as literal NOOPs for
21645   // the time being.
21646   SmallVector<SDValue, 2> Ops;
21647 
21648   Ops.push_back(Op.getOperand(0));
21649   if (Op->getGluedNode())
21650     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21651 
21652   SDLoc OpDL(Op);
21653   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21654   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21655 
21656   return NOOP;
21657 }
21658 
21659 /// Provide custom lowering hooks for some operations.
21660 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
21661   switch (Op.getOpcode()) {
21662   default: llvm_unreachable("Should not custom lower this!");
21663   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
21664   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
21665     return LowerCMP_SWAP(Op, Subtarget, DAG);
21666   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
21667   case ISD::ATOMIC_LOAD_ADD:
21668   case ISD::ATOMIC_LOAD_SUB:
21669   case ISD::ATOMIC_LOAD_OR:
21670   case ISD::ATOMIC_LOAD_XOR:
21671   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
21672   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
21673   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
21674   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
21675   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
21676   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
21677   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
21678   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
21679   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
21680   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
21681   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
21682   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
21683   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
21684   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
21685   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
21686   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
21687   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
21688   case ISD::SHL_PARTS:
21689   case ISD::SRA_PARTS:
21690   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
21691   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
21692   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
21693   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
21694   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
21695   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
21696   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
21697   case ISD::SIGN_EXTEND_VECTOR_INREG:
21698     return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
21699   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
21700   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
21701   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
21702   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
21703   case ISD::FABS:
21704   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
21705   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
21706   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
21707   case ISD::SETCC:              return LowerSETCC(Op, DAG);
21708   case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
21709   case ISD::SELECT:             return LowerSELECT(Op, DAG);
21710   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
21711   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
21712   case ISD::VASTART:            return LowerVASTART(Op, DAG);
21713   case ISD::VAARG:              return LowerVAARG(Op, DAG);
21714   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
21715   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
21716   case ISD::INTRINSIC_VOID:
21717   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
21718   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
21719   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
21720   case ISD::FRAME_TO_ARGS_OFFSET:
21721                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
21722   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
21723   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
21724   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
21725   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
21726   case ISD::EH_SJLJ_SETUP_DISPATCH:
21727     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
21728   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
21729   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
21730   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
21731   case ISD::CTLZ:
21732   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
21733   case ISD::CTTZ:
21734   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
21735   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
21736   case ISD::MULHS:
21737   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
21738   case ISD::UMUL_LOHI:
21739   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
21740   case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
21741   case ISD::SRA:
21742   case ISD::SRL:
21743   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
21744   case ISD::SADDO:
21745   case ISD::UADDO:
21746   case ISD::SSUBO:
21747   case ISD::USUBO:
21748   case ISD::SMULO:
21749   case ISD::UMULO:              return LowerXALUO(Op, DAG);
21750   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
21751   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
21752   case ISD::ADDC:
21753   case ISD::ADDE:
21754   case ISD::SUBC:
21755   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
21756   case ISD::ADD:                return LowerADD(Op, DAG);
21757   case ISD::SUB:                return LowerSUB(Op, DAG);
21758   case ISD::SMAX:
21759   case ISD::SMIN:
21760   case ISD::UMAX:
21761   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
21762   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
21763   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
21764   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
21765   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
21766   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
21767   case ISD::GC_TRANSITION_START:
21768                                 return LowerGC_TRANSITION_START(Op, DAG);
21769   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
21770   case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
21771   }
21772 }
21773 
21774 /// Places new result values for the node in Results (their number
21775 /// and types must exactly match those of the original return values of
21776 /// the node), or leaves Results empty, which indicates that the node is not
21777 /// to be custom lowered after all.
21778 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
21779                                               SmallVectorImpl<SDValue> &Results,
21780                                               SelectionDAG &DAG) const {
21781   SDValue Res = LowerOperation(SDValue(N, 0), DAG);
21782 
21783   if (!Res.getNode())
21784     return;
21785 
21786   assert((N->getNumValues() <= Res->getNumValues()) &&
21787       "Lowering returned the wrong number of results!");
21788 
21789   // Place new result values based on the result number of N.
21790   // In some cases (e.g. LowerSINT_TO_FP) Res has more result values than the
21791   // original node; the extra chain result (the last value) should be dropped.
21792   for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
21793       Results.push_back(Res.getValue(I));
21794 }
21795 
21796 /// Replace a node with an illegal result type with a new node built out of
21797 /// custom code.
21798 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
21799                                            SmallVectorImpl<SDValue>&Results,
21800                                            SelectionDAG &DAG) const {
21801   SDLoc dl(N);
21802   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21803   switch (N->getOpcode()) {
21804   default:
21805     llvm_unreachable("Do not know how to custom type legalize this operation!");
21806   case X86ISD::AVG: {
21807     // Legalize types for X86ISD::AVG by expanding vectors.
21808     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21809 
21810     auto InVT = N->getValueType(0);
21811     auto InVTSize = InVT.getSizeInBits();
21812     const unsigned RegSize =
21813         (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
21814     assert((!Subtarget.hasAVX512() || RegSize < 512) &&
21815            "512-bit vector requires AVX512");
21816     assert((!Subtarget.hasAVX2() || RegSize < 256) &&
21817            "256-bit vector requires AVX2");
21818 
21819     auto ElemVT = InVT.getVectorElementType();
21820     auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
21821                                   RegSize / ElemVT.getSizeInBits());
21822     assert(RegSize % InVT.getSizeInBits() == 0);
21823     unsigned NumConcat = RegSize / InVT.getSizeInBits();
21824 
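    // For example, a v8i8 AVG is concatenated with an undef v8i8 to form a
    // legal v16i8, the AVG is done on v16i8, and the low v8i8 of the result
    // is extracted below.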
21825     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
21826     Ops[0] = N->getOperand(0);
21827     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21828     Ops[0] = N->getOperand(1);
21829     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21830 
21831     SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
21832     Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
21833                                   DAG.getIntPtrConstant(0, dl)));
21834     return;
21835   }
21836   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
21837   case X86ISD::FMINC:
21838   case X86ISD::FMIN:
21839   case X86ISD::FMAXC:
21840   case X86ISD::FMAX: {
21841     EVT VT = N->getValueType(0);
21842     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
21843     SDValue UNDEF = DAG.getUNDEF(VT);
21844     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21845                               N->getOperand(0), UNDEF);
21846     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21847                               N->getOperand(1), UNDEF);
21848     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
21849     return;
21850   }
21851   case ISD::SIGN_EXTEND_INREG:
21852   case ISD::ADDC:
21853   case ISD::ADDE:
21854   case ISD::SUBC:
21855   case ISD::SUBE:
21856     // We don't want to expand or promote these.
21857     return;
21858   case ISD::SDIV:
21859   case ISD::UDIV:
21860   case ISD::SREM:
21861   case ISD::UREM:
21862   case ISD::SDIVREM:
21863   case ISD::UDIVREM: {
21864     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
21865     Results.push_back(V);
21866     return;
21867   }
21868   case ISD::FP_TO_SINT:
21869   case ISD::FP_TO_UINT: {
21870     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
21871 
21872     std::pair<SDValue,SDValue> Vals =
21873         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
21874     SDValue FIST = Vals.first, StackSlot = Vals.second;
21875     if (FIST.getNode()) {
21876       EVT VT = N->getValueType(0);
21877       // Return a load from the stack slot.
21878       if (StackSlot.getNode())
21879         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
21880                                       MachinePointerInfo(),
21881                                       false, false, false, 0));
21882       else
21883         Results.push_back(FIST);
21884     }
21885     return;
21886   }
21887   case ISD::UINT_TO_FP: {
21888     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21889     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
21890         N->getValueType(0) != MVT::v2f32)
21891       return;
21892     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
21893                                  N->getOperand(0));
21894     SDValue VBias =
21895         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
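    // 0x4330000000000000 is 2^52 as a double. OR'ing the zero-extended 32-bit
    // value into its low mantissa bits gives exactly 2^52 + x, so subtracting
    // 2^52 below recovers x converted to double, which is then narrowed to
    // v2f32 with VFPROUND.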
21896     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
21897                              DAG.getBitcast(MVT::v2i64, VBias));
21898     Or = DAG.getBitcast(MVT::v2f64, Or);
21899     // TODO: Are there any fast-math-flags to propagate here?
21900     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
21901     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
21902     return;
21903   }
21904   case ISD::FP_ROUND: {
21905     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
21906         return;
21907     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
21908     Results.push_back(V);
21909     return;
21910   }
21911   case ISD::FP_EXTEND: {
21912     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
21913     // No other ValueType for FP_EXTEND should reach this point.
21914     assert(N->getValueType(0) == MVT::v2f32 &&
21915            "Do not know how to legalize this Node");
21916     return;
21917   }
21918   case ISD::INTRINSIC_W_CHAIN: {
21919     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21920     switch (IntNo) {
21921     default : llvm_unreachable("Do not know how to custom type "
21922                                "legalize this intrinsic operation!");
21923     case Intrinsic::x86_rdtsc:
21924       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21925                                      Results);
21926     case Intrinsic::x86_rdtscp:
21927       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
21928                                      Results);
21929     case Intrinsic::x86_rdpmc:
21930       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
21931     }
21932   }
21933   case ISD::INTRINSIC_WO_CHAIN: {
21934     if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
21935       Results.push_back(V);
21936     return;
21937   }
21938   case ISD::READCYCLECOUNTER: {
21939     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21940                                    Results);
21941   }
21942   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
21943     EVT T = N->getValueType(0);
21944     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
21945     bool Regs64bit = T == MVT::i128;
21946     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
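    // CMPXCHG8B/CMPXCHG16B take the expected value in EDX:EAX (RDX:RAX) and
    // the replacement in ECX:EBX (RCX:RBX); the old memory value comes back
    // in EDX:EAX (RDX:RAX) and ZF reports success. The register copies below
    // set up and read back exactly those halves.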
21947     SDValue cpInL, cpInH;
21948     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21949                         DAG.getConstant(0, dl, HalfT));
21950     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21951                         DAG.getConstant(1, dl, HalfT));
21952     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
21953                              Regs64bit ? X86::RAX : X86::EAX,
21954                              cpInL, SDValue());
21955     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
21956                              Regs64bit ? X86::RDX : X86::EDX,
21957                              cpInH, cpInL.getValue(1));
21958     SDValue swapInL, swapInH;
21959     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21960                           DAG.getConstant(0, dl, HalfT));
21961     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21962                           DAG.getConstant(1, dl, HalfT));
21963     swapInH =
21964         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
21965                          swapInH, cpInH.getValue(1));
21966     // If the current function needs the base pointer, RBX,
21967     // we shouldn't use cmpxchg directly.
21968     // The lowering of that instruction will clobber RBX, and
21969     // since RBX will be a reserved register, the register
21970     // allocator will not make sure its value is properly
21971     // saved and restored around this live-range.
21972     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
21973     SDValue Result;
21974     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21975     unsigned BasePtr = TRI->getBaseRegister();
21976     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
21977     if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
21978         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
21979       // ISel prefers the LCMPXCHG64 variant.
21980       // If that assert fires, it means that is no longer the case, and we
21981       // need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just EBX.
21982       // This is a matter of accepting an i64 input for that pseudo, and
21983       // restoring into the register of the right width in the expand
21984       // pseudo. Everything else should just work.
21985       assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
21986              "Saving only half of the RBX");
21987       unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
21988                                   : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
21989       SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
21990                                            Regs64bit ? X86::RBX : X86::EBX,
21991                                            HalfT, swapInH.getValue(1));
21992       SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
21993                        RBXSave,
21994                        /*Glue*/ RBXSave.getValue(2)};
21995       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
21996     } else {
21997       unsigned Opcode =
21998           Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
21999       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
22000                                  Regs64bit ? X86::RBX : X86::EBX, swapInL,
22001                                  swapInH.getValue(1));
22002       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
22003                        swapInL.getValue(1)};
22004       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
22005     }
22006     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
22007                                         Regs64bit ? X86::RAX : X86::EAX,
22008                                         HalfT, Result.getValue(1));
22009     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
22010                                         Regs64bit ? X86::RDX : X86::EDX,
22011                                         HalfT, cpOutL.getValue(2));
22012     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
22013 
22014     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
22015                                         MVT::i32, cpOutH.getValue(2));
22016     SDValue Success =
22017         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22018                     DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
22019     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
22020 
22021     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
22022     Results.push_back(Success);
22023     Results.push_back(EFLAGS.getValue(1));
22024     return;
22025   }
22026   case ISD::ATOMIC_SWAP:
22027   case ISD::ATOMIC_LOAD_ADD:
22028   case ISD::ATOMIC_LOAD_SUB:
22029   case ISD::ATOMIC_LOAD_AND:
22030   case ISD::ATOMIC_LOAD_OR:
22031   case ISD::ATOMIC_LOAD_XOR:
22032   case ISD::ATOMIC_LOAD_NAND:
22033   case ISD::ATOMIC_LOAD_MIN:
22034   case ISD::ATOMIC_LOAD_MAX:
22035   case ISD::ATOMIC_LOAD_UMIN:
22036   case ISD::ATOMIC_LOAD_UMAX:
22037   case ISD::ATOMIC_LOAD: {
22038     // Delegate to generic TypeLegalization. Situations we can really handle
22039     // should have already been dealt with by AtomicExpandPass.cpp.
22040     break;
22041   }
22042   case ISD::BITCAST: {
22043     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22044     EVT DstVT = N->getValueType(0);
22045     EVT SrcVT = N->getOperand(0)->getValueType(0);
22046 
22047     if (SrcVT != MVT::f64 ||
22048         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
22049       return;
22050 
22051     unsigned NumElts = DstVT.getVectorNumElements();
22052     EVT SVT = DstVT.getVectorElementType();
22053     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
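    // For example, for f64 -> v2i32: place the f64 in lane 0 of a v2f64,
    // bitcast that to v4i32 (WiderVT), and, unless widening legalization is
    // enabled, rebuild the illegal v2i32 by extracting the two low elements
    // into a BUILD_VECTOR.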
22054     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
22055                                    MVT::v2f64, N->getOperand(0));
22056     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
22057 
22058     if (ExperimentalVectorWideningLegalization) {
22059       // If we are legalizing vectors by widening, we already have the desired
22060       // legal vector type, just return it.
22061       Results.push_back(ToVecInt);
22062       return;
22063     }
22064 
22065     SmallVector<SDValue, 8> Elts;
22066     for (unsigned i = 0, e = NumElts; i != e; ++i)
22067       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
22068                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
22069 
22070     Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
22071   }
22072   }
22073 }
22074 
22075 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
22076   switch ((X86ISD::NodeType)Opcode) {
22077   case X86ISD::FIRST_NUMBER:       break;
22078   case X86ISD::BSF:                return "X86ISD::BSF";
22079   case X86ISD::BSR:                return "X86ISD::BSR";
22080   case X86ISD::SHLD:               return "X86ISD::SHLD";
22081   case X86ISD::SHRD:               return "X86ISD::SHRD";
22082   case X86ISD::FAND:               return "X86ISD::FAND";
22083   case X86ISD::FANDN:              return "X86ISD::FANDN";
22084   case X86ISD::FOR:                return "X86ISD::FOR";
22085   case X86ISD::FXOR:               return "X86ISD::FXOR";
22086   case X86ISD::FILD:               return "X86ISD::FILD";
22087   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
22088   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
22089   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
22090   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
22091   case X86ISD::FLD:                return "X86ISD::FLD";
22092   case X86ISD::FST:                return "X86ISD::FST";
22093   case X86ISD::CALL:               return "X86ISD::CALL";
22094   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
22095   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
22096   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
22097   case X86ISD::BT:                 return "X86ISD::BT";
22098   case X86ISD::CMP:                return "X86ISD::CMP";
22099   case X86ISD::COMI:               return "X86ISD::COMI";
22100   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
22101   case X86ISD::CMPM:               return "X86ISD::CMPM";
22102   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
22103   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
22104   case X86ISD::SETCC:              return "X86ISD::SETCC";
22105   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
22106   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
22107   case X86ISD::CMOV:               return "X86ISD::CMOV";
22108   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
22109   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
22110   case X86ISD::IRET:               return "X86ISD::IRET";
22111   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
22112   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
22113   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
22114   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
22115   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
22116   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
22117   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
22118   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
22119   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
22120   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
22121   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
22122   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
22123   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
22124   case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
22125   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
22126   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
22127   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
22128   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
22129   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
22130   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
22131   case X86ISD::HADD:               return "X86ISD::HADD";
22132   case X86ISD::HSUB:               return "X86ISD::HSUB";
22133   case X86ISD::FHADD:              return "X86ISD::FHADD";
22134   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
22135   case X86ISD::ABS:                return "X86ISD::ABS";
22136   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
22137   case X86ISD::FMAX:               return "X86ISD::FMAX";
22138   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
22139   case X86ISD::FMIN:               return "X86ISD::FMIN";
22140   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
22141   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
22142   case X86ISD::FMINC:              return "X86ISD::FMINC";
22143   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
22144   case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
22145   case X86ISD::FRCP:               return "X86ISD::FRCP";
22146   case X86ISD::FRCPS:              return "X86ISD::FRCPS";
22147   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
22148   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
22149   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
22150   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
22151   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
22152   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
22153   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
22154   case X86ISD::EH_SJLJ_SETUP_DISPATCH:
22155     return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
22156   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
22157   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
22158   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
22159   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
22160   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
22161   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
22162   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
22163   case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
22164     return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
22165   case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
22166     return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
22167   case X86ISD::LADD:               return "X86ISD::LADD";
22168   case X86ISD::LSUB:               return "X86ISD::LSUB";
22169   case X86ISD::LOR:                return "X86ISD::LOR";
22170   case X86ISD::LXOR:               return "X86ISD::LXOR";
22171   case X86ISD::LAND:               return "X86ISD::LAND";
22172   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
22173   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
22174   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
22175   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
22176   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
22177   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
22178   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
22179   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
22180   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
22181   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
22182   case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
22183   case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
22184   case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
22185   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
22186   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
22187   case X86ISD::VSHL:               return "X86ISD::VSHL";
22188   case X86ISD::VSRL:               return "X86ISD::VSRL";
22189   case X86ISD::VSRA:               return "X86ISD::VSRA";
22190   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
22191   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
22192   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
22193   case X86ISD::VSRAV:              return "X86ISD::VSRAV";
22194   case X86ISD::VROTLI:             return "X86ISD::VROTLI";
22195   case X86ISD::VROTRI:             return "X86ISD::VROTRI";
22196   case X86ISD::VPPERM:             return "X86ISD::VPPERM";
22197   case X86ISD::CMPP:               return "X86ISD::CMPP";
22198   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
22199   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
22200   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
22201   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
22202   case X86ISD::ADD:                return "X86ISD::ADD";
22203   case X86ISD::SUB:                return "X86ISD::SUB";
22204   case X86ISD::ADC:                return "X86ISD::ADC";
22205   case X86ISD::SBB:                return "X86ISD::SBB";
22206   case X86ISD::SMUL:               return "X86ISD::SMUL";
22207   case X86ISD::UMUL:               return "X86ISD::UMUL";
22208   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
22209   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
22210   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
22211   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
22212   case X86ISD::INC:                return "X86ISD::INC";
22213   case X86ISD::DEC:                return "X86ISD::DEC";
22214   case X86ISD::OR:                 return "X86ISD::OR";
22215   case X86ISD::XOR:                return "X86ISD::XOR";
22216   case X86ISD::AND:                return "X86ISD::AND";
22217   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
22218   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
22219   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
22220   case X86ISD::PTEST:              return "X86ISD::PTEST";
22221   case X86ISD::TESTP:              return "X86ISD::TESTP";
22222   case X86ISD::TESTM:              return "X86ISD::TESTM";
22223   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
22224   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
22225   case X86ISD::KTEST:              return "X86ISD::KTEST";
22226   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
22227   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
22228   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
22229   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
22230   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
22231   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
22232   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
22233   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
22234   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
22235   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
22236   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
22237   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
22238   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
22239   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
22240   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
22241   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
22242   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
22243   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
22244   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
22245   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
22246   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
22247   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
22248   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
22249   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
22250   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
22251   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
22252   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
22253   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
22254   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
22255   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
22256   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
22257   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
22258   case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
22259   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
22260   case X86ISD::VFIXUPIMMS:         return "X86ISD::VFIXUPIMMS";
22261   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
22262   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
22263   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
22264   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
22265   case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
22266   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
22267   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
22268   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
22269   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
22270   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
22271   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
22272   case X86ISD::SAHF:               return "X86ISD::SAHF";
22273   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
22274   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
22275   case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
22276   case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
22277   case X86ISD::VPROT:              return "X86ISD::VPROT";
22278   case X86ISD::VPROTI:             return "X86ISD::VPROTI";
22279   case X86ISD::VPSHA:              return "X86ISD::VPSHA";
22280   case X86ISD::VPSHL:              return "X86ISD::VPSHL";
22281   case X86ISD::VPCOM:              return "X86ISD::VPCOM";
22282   case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
22283   case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
22284   case X86ISD::FMADD:              return "X86ISD::FMADD";
22285   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
22286   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
22287   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
22288   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
22289   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
22290   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
22291   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
22292   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
22293   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
22294   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
22295   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
22296   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
22297   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
22298   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
22299   case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
22300   case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
22301   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
22302   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
22303   case X86ISD::XTEST:              return "X86ISD::XTEST";
22304   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
22305   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
22306   case X86ISD::SELECT:             return "X86ISD::SELECT";
22307   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
22308   case X86ISD::RCP28:              return "X86ISD::RCP28";
22309   case X86ISD::EXP2:               return "X86ISD::EXP2";
22310   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
22311   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
22312   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
22313   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
22314   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
22315   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
22316   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
22317   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
22318   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
22319   case X86ISD::ADDS:               return "X86ISD::ADDS";
22320   case X86ISD::SUBS:               return "X86ISD::SUBS";
22321   case X86ISD::AVG:                return "X86ISD::AVG";
22322   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
22323   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
22324   case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
22325   case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
22326   case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
22327   case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
22328   case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
22329   case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
22330   case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
22331   case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
22332   }
22333   return nullptr;
22334 }
22335 
22336 /// Return true if the addressing mode represented by AM is legal for this
22337 /// target, for a load/store of the specified type.
22338 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
22339                                               const AddrMode &AM, Type *Ty,
22340                                               unsigned AS) const {
22341   // X86 supports extremely general addressing modes.
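  // The general form is Base + Scale*Index + Disp (plus an optional global),
  // e.g. movl 16(%rdi,%rcx,4), %eax corresponds to AM = {HasBaseReg,
  // Scale = 4, BaseOffs = 16}.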
22342   CodeModel::Model M = getTargetMachine().getCodeModel();
22343 
22344   // X86 allows a sign-extended 32-bit immediate field as a displacement.
22345   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
22346     return false;
22347 
22348   if (AM.BaseGV) {
22349     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
22350 
22351     // If a reference to this global requires an extra load, we can't fold it.
22352     if (isGlobalStubReference(GVFlags))
22353       return false;
22354 
22355     // If BaseGV requires a register for the PIC base, we cannot also have a
22356     // BaseReg specified.
22357     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
22358       return false;
22359 
22360     // If lower 4G is not available, then we must use rip-relative addressing.
22361     if ((M != CodeModel::Small || isPositionIndependent()) &&
22362         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
22363       return false;
22364   }
22365 
22366   switch (AM.Scale) {
22367   case 0:
22368   case 1:
22369   case 2:
22370   case 4:
22371   case 8:
22372     // These scales always work.
22373     break;
22374   case 3:
22375   case 5:
22376   case 9:
22377     // These scales are formed with basereg+scalereg.  Only accept if there is
22378     // no basereg yet.
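    // For example, a scale of 3 is only encodable as base + 2*index with both
    // slots holding the same register, e.g. "leaq (%rcx,%rcx,2), %rax" for
    // 3*rcx (illustrative), so the base-register slot must still be free.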
22379     if (AM.HasBaseReg)
22380       return false;
22381     break;
22382   default:  // Other stuff never works.
22383     return false;
22384   }
22385 
22386   return true;
22387 }
22388 
22389 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
22390   unsigned Bits = Ty->getScalarSizeInBits();
22391 
22392   // 8-bit shifts are always expensive; versions with a scalar amount are not
22393   // noticeably cheaper than those without.
22394   if (Bits == 8)
22395     return false;
22396 
22397   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
22398   // variable shifts just as cheap as scalar ones.
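  // For example (illustrative), with AVX2:
  //   vpsllvd %ymm1, %ymm0, %ymm0
  // shifts each 32-bit lane of %ymm0 by the corresponding lane of %ymm1.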
22399   if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
22400     return false;
22401 
22402   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
22403   // fully general vector.
22404   return true;
22405 }
22406 
22407 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
22408   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22409     return false;
22410   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
22411   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
22412   return NumBits1 > NumBits2;
22413 }
22414 
22415 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
22416   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22417     return false;
22418 
22419   if (!isTypeLegal(EVT::getEVT(Ty1)))
22420     return false;
22421 
22422   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
22423 
22424   // Assuming the caller doesn't have a zeroext or signext return parameter,
22425   // truncation all the way down to i1 is valid.
22426   return true;
22427 }
22428 
22429 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
22430   return isInt<32>(Imm);
22431 }
22432 
22433 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
22434   // Can also use sub to handle negated immediates.
22435   return isInt<32>(Imm);
22436 }
22437 
22438 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
22439   if (!VT1.isInteger() || !VT2.isInteger())
22440     return false;
22441   unsigned NumBits1 = VT1.getSizeInBits();
22442   unsigned NumBits2 = VT2.getSizeInBits();
22443   return NumBits1 > NumBits2;
22444 }
22445 
22446 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
22447   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
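  // For example, "movl (%rdi), %eax" already leaves the upper 32 bits of %rax
  // cleared, so no separate zero-extending instruction is needed (illustrative
  // register choice).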
22448   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
22449 }
22450 
22451 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
22452   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22453   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
22454 }
22455 
22456 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
22457   EVT VT1 = Val.getValueType();
22458   if (isZExtFree(VT1, VT2))
22459     return true;
22460 
22461   if (Val.getOpcode() != ISD::LOAD)
22462     return false;
22463 
22464   if (!VT1.isSimple() || !VT1.isInteger() ||
22465       !VT2.isSimple() || !VT2.isInteger())
22466     return false;
22467 
22468   switch (VT1.getSimpleVT().SimpleTy) {
22469   default: break;
22470   case MVT::i8:
22471   case MVT::i16:
22472   case MVT::i32:
22473     // X86 has 8, 16, and 32-bit zero-extending loads.
22474     return true;
22475   }
22476 
22477   return false;
22478 }
22479 
22480 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
22481 
22482 bool
22483 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
22484   if (!Subtarget.hasAnyFMA())
22485     return false;
22486 
22487   VT = VT.getScalarType();
22488 
22489   if (!VT.isSimple())
22490     return false;
22491 
22492   switch (VT.getSimpleVT().SimpleTy) {
22493   case MVT::f32:
22494   case MVT::f64:
22495     return true;
22496   default:
22497     break;
22498   }
22499 
22500   return false;
22501 }
22502 
22503 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
22504   // i16 instructions are longer (0x66 prefix) and potentially slower.
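  // For example (illustrative encodings):
  //   addw $1, %ax    encodes as 66 83 C0 01 (extra 0x66 operand-size prefix)
  //   addl $1, %eax   encodes as    83 C0 01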
22505   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
22506 }
22507 
22508 /// Targets can use this to indicate that they only support *some*
22509 /// VECTOR_SHUFFLE operations, those with specific masks.
22510 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
22511 /// are assumed to be legal.
22512 bool
22513 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
22514                                       EVT VT) const {
22515   if (!VT.isSimple())
22516     return false;
22517 
22518   // Not for i1 vectors
22519   if (VT.getSimpleVT().getScalarType() == MVT::i1)
22520     return false;
22521 
22522   // Very little shuffling can be done for 64-bit vectors right now.
22523   if (VT.getSimpleVT().getSizeInBits() == 64)
22524     return false;
22525 
22526   // We only care that the types being shuffled are legal. The lowering can
22527   // handle any possible shuffle mask that results.
22528   return isTypeLegal(VT.getSimpleVT());
22529 }
22530 
22531 bool
22532 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
22533                                           EVT VT) const {
22534   // Just delegate to the generic legality, clear masks aren't special.
22535   return isShuffleMaskLegal(Mask, VT);
22536 }
22537 
22538 //===----------------------------------------------------------------------===//
22539 //                           X86 Scheduler Hooks
22540 //===----------------------------------------------------------------------===//
22541 
22542 /// Utility function to emit xbegin specifying the start of an RTM region.
22543 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
22544                                      const TargetInstrInfo *TII) {
22545   DebugLoc DL = MI.getDebugLoc();
22546 
22547   const BasicBlock *BB = MBB->getBasicBlock();
22548   MachineFunction::iterator I = ++MBB->getIterator();
22549 
22550   // For the v = xbegin(), we generate
22551   //
22552   // thisMBB:
22553   //  xbegin sinkMBB
22554   //
22555   // mainMBB:
22556   //  eax = -1
22557   //
22558   // sinkMBB:
22559   //  v = eax
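  // (If the transaction aborts, hardware resumes at the xbegin fallback
  //  target, sinkMBB here, with an abort status in EAX; on the fall-through
  //  path mainMBB loads -1, i.e. XBEGIN_STARTED, so sinkMBB can read EAX
  //  either way.)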
22560 
22561   MachineBasicBlock *thisMBB = MBB;
22562   MachineFunction *MF = MBB->getParent();
22563   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
22564   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
22565   MF->insert(I, mainMBB);
22566   MF->insert(I, sinkMBB);
22567 
22568   // Transfer the remainder of BB and its successor edges to sinkMBB.
22569   sinkMBB->splice(sinkMBB->begin(), MBB,
22570                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
22571   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
22572 
22573   // thisMBB:
22574   //  xbegin sinkMBB
22575   //  # fallthrough to mainMBB
22576   //  # abort path to sinkMBB
22577   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
22578   thisMBB->addSuccessor(mainMBB);
22579   thisMBB->addSuccessor(sinkMBB);
22580 
22581   // mainMBB:
22582   //  EAX = -1
22583   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
22584   mainMBB->addSuccessor(sinkMBB);
22585 
22586   // sinkMBB:
22587   // EAX is live into the sinkMBB
22588   sinkMBB->addLiveIn(X86::EAX);
22589   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
22590           MI.getOperand(0).getReg())
22591       .addReg(X86::EAX);
22592 
22593   MI.eraseFromParent();
22594   return sinkMBB;
22595 }
22596 
22597 // FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
22598 // or XMM0_V32I8 in AVX, all of this code can be replaced with patterns
22599 // in the .td file.
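// Note: the PCMPxSTRM forms implicitly define XMM0; e.g. (illustrative)
//   pcmpistrm $0x0, %xmm1, %xmm2
// leaves its mask result in %xmm0, which is why the expansion below copies the
// result out of XMM0 rather than out of a normal def operand.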
22600 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
22601                                        const TargetInstrInfo *TII) {
22602   unsigned Opc;
22603   switch (MI.getOpcode()) {
22604   default: llvm_unreachable("illegal opcode!");
22605   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
22606   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
22607   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
22608   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
22609   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
22610   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
22611   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
22612   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
22613   }
22614 
22615   DebugLoc dl = MI.getDebugLoc();
22616   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22617 
22618   unsigned NumArgs = MI.getNumOperands();
22619   for (unsigned i = 1; i < NumArgs; ++i) {
22620     MachineOperand &Op = MI.getOperand(i);
22621     if (!(Op.isReg() && Op.isImplicit()))
22622       MIB.addOperand(Op);
22623   }
22624   if (MI.hasOneMemOperand())
22625     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22626 
22627   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22628       .addReg(X86::XMM0);
22629 
22630   MI.eraseFromParent();
22631   return BB;
22632 }
22633 
22634 // FIXME: Custom handling because TableGen doesn't support multiple implicit
22635 // defs in an instruction pattern
22636 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
22637                                        const TargetInstrInfo *TII) {
22638   unsigned Opc;
22639   switch (MI.getOpcode()) {
22640   default: llvm_unreachable("illegal opcode!");
22641   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
22642   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
22643   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
22644   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
22645   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
22646   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
22647   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
22648   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
22649   }
22650 
22651   DebugLoc dl = MI.getDebugLoc();
22652   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22653 
22654   unsigned NumArgs = MI.getNumOperands(); // operand 0 is the result; skip it below
22655   for (unsigned i = 1; i < NumArgs; ++i) {
22656     MachineOperand &Op = MI.getOperand(i);
22657     if (!(Op.isReg() && Op.isImplicit()))
22658       MIB.addOperand(Op);
22659   }
22660   if (MI.hasOneMemOperand())
22661     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22662 
22663   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22664       .addReg(X86::ECX);
22665 
22666   MI.eraseFromParent();
22667   return BB;
22668 }
22669 
22670 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22671                                      const X86Subtarget &Subtarget) {
22672   DebugLoc dl = MI.getDebugLoc();
22673   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22674 
22675   // insert input VAL into EAX
22676   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
22677       .addReg(MI.getOperand(0).getReg());
22678   // insert zero to ECX
22679   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22680 
22681   // insert zero to EDX
22682   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
22683 
22684   // insert WRPKRU instruction
22685   BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
22686 
22687   MI.eraseFromParent(); // The pseudo is gone now.
22688   return BB;
22689 }
22690 
22691 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22692                                      const X86Subtarget &Subtarget) {
22693   DebugLoc dl = MI.getDebugLoc();
22694   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22695 
22696   // insert zero to ECX
22697   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22698 
22699   // insert RDPKRU instruction
22700   BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
22701   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22702       .addReg(X86::EAX);
22703 
22704   MI.eraseFromParent(); // The pseudo is gone now.
22705   return BB;
22706 }
22707 
22708 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
22709                                       const X86Subtarget &Subtarget,
22710                                       unsigned Opc) {
22711   DebugLoc dl = MI.getDebugLoc();
22712   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22713   // Address into RAX/EAX, other two args into ECX, EDX.
22714   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
22715   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
22716   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
22717   for (int i = 0; i < X86::AddrNumOperands; ++i)
22718     MIB.addOperand(MI.getOperand(i));
22719 
22720   unsigned ValOps = X86::AddrNumOperands;
22721   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
22722       .addReg(MI.getOperand(ValOps).getReg());
22723   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
22724       .addReg(MI.getOperand(ValOps + 1).getReg());
22725 
22726   // The instruction itself takes no explicit operands; its inputs are implicit.
22727   BuildMI(*BB, MI, dl, TII->get(Opc));
22728 
22729   MI.eraseFromParent(); // The pseudo is gone now.
22730   return BB;
22731 }
22732 
22733 MachineBasicBlock *
22734 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
22735                                                  MachineBasicBlock *MBB) const {
22736   // Emit va_arg instruction on X86-64.
22737 
22738   // Operands to this pseudo-instruction:
22739   // 0  ) Output        : destination address (reg)
22740   // 1-5) Input         : va_list address (addr, i64mem)
22741   // 6  ) ArgSize       : Size (in bytes) of vararg type
22742   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
22743   // 8  ) Align         : Alignment of type
22744   // 9  ) EFLAGS (implicit-def)
22745 
22746   assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
22747   static_assert(X86::AddrNumOperands == 5,
22748                 "VAARG_64 assumes 5 address operands");
22749 
22750   unsigned DestReg = MI.getOperand(0).getReg();
22751   MachineOperand &Base = MI.getOperand(1);
22752   MachineOperand &Scale = MI.getOperand(2);
22753   MachineOperand &Index = MI.getOperand(3);
22754   MachineOperand &Disp = MI.getOperand(4);
22755   MachineOperand &Segment = MI.getOperand(5);
22756   unsigned ArgSize = MI.getOperand(6).getImm();
22757   unsigned ArgMode = MI.getOperand(7).getImm();
22758   unsigned Align = MI.getOperand(8).getImm();
22759 
22760   // Memory Reference
22761   assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
22762   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
22763   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
22764 
22765   // Machine Information
22766   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22767   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
22768   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
22769   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
22770   DebugLoc DL = MI.getDebugLoc();
22771 
22772   // struct va_list {
22773   //   i32   gp_offset
22774   //   i32   fp_offset
22775   //   i64   overflow_area (address)
22776   //   i64   reg_save_area (address)
22777   // }
22778   // sizeof(va_list) = 24
22779   // alignment(va_list) = 8
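  // (Per the SysV AMD64 ABI: gp_offset is the number of bytes of reg_save_area
  //  already consumed by integer registers, 0..48 in steps of 8; fp_offset is
  //  the same for XMM registers, 48..176 in steps of 16.  This is what the
  //  MaxOffset computation below models.)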
22780 
22781   unsigned TotalNumIntRegs = 6;
22782   unsigned TotalNumXMMRegs = 8;
22783   bool UseGPOffset = (ArgMode == 1);
22784   bool UseFPOffset = (ArgMode == 2);
22785   unsigned MaxOffset = TotalNumIntRegs * 8 +
22786                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
22787 
22788   // Align ArgSize to a multiple of 8.
22789   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
22790   bool NeedsAlign = (Align > 8);
22791 
22792   MachineBasicBlock *thisMBB = MBB;
22793   MachineBasicBlock *overflowMBB;
22794   MachineBasicBlock *offsetMBB;
22795   MachineBasicBlock *endMBB;
22796 
22797   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
22798   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
22799   unsigned OffsetReg = 0;
22800 
22801   if (!UseGPOffset && !UseFPOffset) {
22802     // If we only pull from the overflow region, we don't create a branch.
22803     // We don't need to alter control flow.
22804     OffsetDestReg = 0; // unused
22805     OverflowDestReg = DestReg;
22806 
22807     offsetMBB = nullptr;
22808     overflowMBB = thisMBB;
22809     endMBB = thisMBB;
22810   } else {
22811     // First emit code to check if gp_offset (or fp_offset) is below the bound.
22812     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
22813     // If not, pull from overflow_area. (branch to overflowMBB)
22814     //
22815     //       thisMBB
22816     //         |     .
22817     //         |        .
22818     //     offsetMBB   overflowMBB
22819     //         |        .
22820     //         |     .
22821     //        endMBB
22822 
22823     // Registers for the PHI in endMBB
22824     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
22825     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
22826 
22827     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
22828     MachineFunction *MF = MBB->getParent();
22829     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22830     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22831     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22832 
22833     MachineFunction::iterator MBBIter = ++MBB->getIterator();
22834 
22835     // Insert the new basic blocks
22836     MF->insert(MBBIter, offsetMBB);
22837     MF->insert(MBBIter, overflowMBB);
22838     MF->insert(MBBIter, endMBB);
22839 
22840     // Transfer the remainder of MBB and its successor edges to endMBB.
22841     endMBB->splice(endMBB->begin(), thisMBB,
22842                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
22843     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
22844 
22845     // Make offsetMBB and overflowMBB successors of thisMBB
22846     thisMBB->addSuccessor(offsetMBB);
22847     thisMBB->addSuccessor(overflowMBB);
22848 
22849     // endMBB is a successor of both offsetMBB and overflowMBB
22850     offsetMBB->addSuccessor(endMBB);
22851     overflowMBB->addSuccessor(endMBB);
22852 
22853     // Load the offset value into a register
22854     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22855     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
22856       .addOperand(Base)
22857       .addOperand(Scale)
22858       .addOperand(Index)
22859       .addDisp(Disp, UseFPOffset ? 4 : 0)
22860       .addOperand(Segment)
22861       .setMemRefs(MMOBegin, MMOEnd);
22862 
22863     // Check if there is enough room left to pull this argument.
22864     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
22865       .addReg(OffsetReg)
22866       .addImm(MaxOffset + 8 - ArgSizeA8);
22867 
22868     // Branch to "overflowMBB" if offset >= max
22869     // Fall through to "offsetMBB" otherwise
22870     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
22871       .addMBB(overflowMBB);
22872   }
22873 
22874   // In offsetMBB, emit code to use the reg_save_area.
22875   if (offsetMBB) {
22876     assert(OffsetReg != 0);
22877 
22878     // Read the reg_save_area address.
22879     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
22880     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
22881       .addOperand(Base)
22882       .addOperand(Scale)
22883       .addOperand(Index)
22884       .addDisp(Disp, 16)
22885       .addOperand(Segment)
22886       .setMemRefs(MMOBegin, MMOEnd);
22887 
22888     // Zero-extend the offset
22889     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
22890     BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
22891       .addImm(0)
22892       .addReg(OffsetReg)
22893       .addImm(X86::sub_32bit);
22894 
22895     // Add the offset to the reg_save_area to get the final address.
22896     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
22897       .addReg(OffsetReg64)
22898       .addReg(RegSaveReg);
22899 
22900     // Compute the offset for the next argument
22901     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22902     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
22903       .addReg(OffsetReg)
22904       .addImm(UseFPOffset ? 16 : 8);
22905 
22906     // Store it back into the va_list.
22907     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
22908       .addOperand(Base)
22909       .addOperand(Scale)
22910       .addOperand(Index)
22911       .addDisp(Disp, UseFPOffset ? 4 : 0)
22912       .addOperand(Segment)
22913       .addReg(NextOffsetReg)
22914       .setMemRefs(MMOBegin, MMOEnd);
22915 
22916     // Jump to endMBB
22917     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
22918       .addMBB(endMBB);
22919   }
22920 
22921   //
22922   // Emit code to use overflow area
22923   //
22924 
22925   // Load the overflow_area address into a register.
22926   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
22927   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
22928     .addOperand(Base)
22929     .addOperand(Scale)
22930     .addOperand(Index)
22931     .addDisp(Disp, 8)
22932     .addOperand(Segment)
22933     .setMemRefs(MMOBegin, MMOEnd);
22934 
22935   // If we need to align it, do so. Otherwise, just copy the address
22936   // to OverflowDestReg.
22937   if (NeedsAlign) {
22938     // Align the overflow address
22939     assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
22940     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
22941 
22942     // aligned_addr = (addr + (align-1)) & ~(align-1)
22943     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
22944       .addReg(OverflowAddrReg)
22945       .addImm(Align-1);
22946 
22947     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
22948       .addReg(TmpReg)
22949       .addImm(~(uint64_t)(Align-1));
22950   } else {
22951     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
22952       .addReg(OverflowAddrReg);
22953   }
22954 
22955   // Compute the next overflow address after this argument.
22956   // (the overflow address should be kept 8-byte aligned)
22957   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
22958   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
22959     .addReg(OverflowDestReg)
22960     .addImm(ArgSizeA8);
22961 
22962   // Store the new overflow address.
22963   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
22964     .addOperand(Base)
22965     .addOperand(Scale)
22966     .addOperand(Index)
22967     .addDisp(Disp, 8)
22968     .addOperand(Segment)
22969     .addReg(NextAddrReg)
22970     .setMemRefs(MMOBegin, MMOEnd);
22971 
22972   // If we branched, emit the PHI to the front of endMBB.
22973   if (offsetMBB) {
22974     BuildMI(*endMBB, endMBB->begin(), DL,
22975             TII->get(X86::PHI), DestReg)
22976       .addReg(OffsetDestReg).addMBB(offsetMBB)
22977       .addReg(OverflowDestReg).addMBB(overflowMBB);
22978   }
22979 
22980   // Erase the pseudo instruction
22981   MI.eraseFromParent();
22982 
22983   return endMBB;
22984 }
22985 
22986 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
22987     MachineInstr &MI, MachineBasicBlock *MBB) const {
22988   // Emit code to save XMM registers to the stack. The ABI says that the
22989   // number of registers to save is given in %al, so it's theoretically
22990   // possible to do an indirect jump trick to avoid saving all of them;
22991   // however, this code takes a simpler approach and just executes all
22992   // of the stores if %al is non-zero. It's less code, and it's probably
22993   // easier on the hardware branch predictor, and stores aren't all that
22994   // expensive anyway.
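  // (Per the SysV AMD64 ABI, the caller sets %al to an upper bound on the
  //  number of vector registers used for the call; e.g. a caller passing two
  //  floating-point varargs might emit "movb $2, %al" before the call.
  //  Illustrative, not emitted here.)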
22995 
22996   // Create the new basic blocks. One block contains all the XMM stores,
22997   // and one block is the final destination regardless of whether any
22998   // stores were performed.
22999   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
23000   MachineFunction *F = MBB->getParent();
23001   MachineFunction::iterator MBBIter = ++MBB->getIterator();
23002   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
23003   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
23004   F->insert(MBBIter, XMMSaveMBB);
23005   F->insert(MBBIter, EndMBB);
23006 
23007   // Transfer the remainder of MBB and its successor edges to EndMBB.
23008   EndMBB->splice(EndMBB->begin(), MBB,
23009                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23010   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
23011 
23012   // The original block will now fall through to the XMM save block.
23013   MBB->addSuccessor(XMMSaveMBB);
23014   // The XMMSaveMBB will fall through to the end block.
23015   XMMSaveMBB->addSuccessor(EndMBB);
23016 
23017   // Now add the instructions.
23018   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23019   DebugLoc DL = MI.getDebugLoc();
23020 
23021   unsigned CountReg = MI.getOperand(0).getReg();
23022   int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
23023   int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
23024 
23025   if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
23026     // If %al is 0, branch around the XMM save block.
23027     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
23028     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
23029     MBB->addSuccessor(EndMBB);
23030   }
23031 
23032   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
23033   // that was just emitted, but clearly shouldn't be "saved".
23034   assert((MI.getNumOperands() <= 3 ||
23035           !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
23036           MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
23037          "Expected last argument to be EFLAGS");
23038   unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
23039   // In the XMM save block, save all the XMM argument registers.
23040   for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
23041     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
23042     MachineMemOperand *MMO = F->getMachineMemOperand(
23043         MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
23044         MachineMemOperand::MOStore,
23045         /*Size=*/16, /*Align=*/16);
23046     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
23047         .addFrameIndex(RegSaveFrameIndex)
23048         .addImm(/*Scale=*/1)
23049         .addReg(/*IndexReg=*/0)
23050         .addImm(/*Disp=*/Offset)
23051         .addReg(/*Segment=*/0)
23052         .addReg(MI.getOperand(i).getReg())
23053         .addMemOperand(MMO);
23054   }
23055 
23056   MI.eraseFromParent(); // The pseudo instruction is gone now.
23057 
23058   return EndMBB;
23059 }
23060 
23061 // The EFLAGS operand of SelectItr might be missing a kill marker
23062 // because there were multiple uses of EFLAGS, and ISel didn't know
23063 // which to mark. Figure out whether SelectItr should have had a
23064 // kill marker, and set it if it should. Returns the correct kill
23065 // marker value.
23066 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
23067                                      MachineBasicBlock* BB,
23068                                      const TargetRegisterInfo* TRI) {
23069   // Scan forward through BB for a use/def of EFLAGS.
23070   MachineBasicBlock::iterator miI(std::next(SelectItr));
23071   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
23072     const MachineInstr& mi = *miI;
23073     if (mi.readsRegister(X86::EFLAGS))
23074       return false;
23075     if (mi.definesRegister(X86::EFLAGS))
23076       break; // Should have kill-flag - update below.
23077   }
23078 
23079   // If we hit the end of the block, check whether EFLAGS is live into a
23080   // successor.
23081   if (miI == BB->end()) {
23082     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
23083                                           sEnd = BB->succ_end();
23084          sItr != sEnd; ++sItr) {
23085       MachineBasicBlock* succ = *sItr;
23086       if (succ->isLiveIn(X86::EFLAGS))
23087         return false;
23088     }
23089   }
23090 
23091   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
23092   // out. SelectMI should have a kill flag on EFLAGS.
23093   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
23094   return true;
23095 }
23096 
23097 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
23098 // together with other CMOV pseudo-opcodes into a single basic block with
23099 // a conditional jump around it.
23100 static bool isCMOVPseudo(MachineInstr &MI) {
23101   switch (MI.getOpcode()) {
23102   case X86::CMOV_FR32:
23103   case X86::CMOV_FR64:
23104   case X86::CMOV_GR8:
23105   case X86::CMOV_GR16:
23106   case X86::CMOV_GR32:
23107   case X86::CMOV_RFP32:
23108   case X86::CMOV_RFP64:
23109   case X86::CMOV_RFP80:
23110   case X86::CMOV_V2F64:
23111   case X86::CMOV_V2I64:
23112   case X86::CMOV_V4F32:
23113   case X86::CMOV_V4F64:
23114   case X86::CMOV_V4I64:
23115   case X86::CMOV_V16F32:
23116   case X86::CMOV_V8F32:
23117   case X86::CMOV_V8F64:
23118   case X86::CMOV_V8I64:
23119   case X86::CMOV_V8I1:
23120   case X86::CMOV_V16I1:
23121   case X86::CMOV_V32I1:
23122   case X86::CMOV_V64I1:
23123     return true;
23124 
23125   default:
23126     return false;
23127   }
23128 }
23129 
23130 MachineBasicBlock *
23131 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
23132                                      MachineBasicBlock *BB) const {
23133   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23134   DebugLoc DL = MI.getDebugLoc();
23135 
23136   // To "insert" a SELECT_CC instruction, we actually have to insert the
23137   // diamond control-flow pattern.  The incoming instruction knows the
23138   // destination vreg to set, the condition code register to branch on, the
23139   // true/false values to select between, and a branch opcode to use.
23140   const BasicBlock *LLVM_BB = BB->getBasicBlock();
23141   MachineFunction::iterator It = ++BB->getIterator();
23142 
23143   //  thisMBB:
23144   //  ...
23145   //   TrueVal = ...
23146   //   cmpTY ccX, r1, r2
23147   //   bCC copy1MBB
23148   //   fallthrough --> copy0MBB
23149   MachineBasicBlock *thisMBB = BB;
23150   MachineFunction *F = BB->getParent();
23151 
23152   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
23153   // as described above, by inserting a BB, and then making a PHI at the join
23154   // point to select the true and false operands of the CMOV in the PHI.
23155   //
23156   // The code also handles two different cases of multiple CMOV opcodes
23157   // in a row.
23158   //
23159   // Case 1:
23160   // In this case, there are multiple CMOVs in a row, all of which are based on
23161   // the same condition setting (or the exact opposite condition setting).
23162   // In this case we can lower all the CMOVs using a single inserted BB, and
23163   // then make a number of PHIs at the join point to model the CMOVs. The only
23164   // trickiness here is that in a case like:
23165   //
23166   // t2 = CMOV cond1 t1, f1
23167   // t3 = CMOV cond1 t2, f2
23168   //
23169   // when rewriting this into PHIs, we have to perform some renaming on the
23170   // temps since you cannot have a PHI operand refer to a PHI result earlier
23171   // in the same block.  The "simple" but wrong lowering would be:
23172   //
23173   // t2 = PHI t1(BB1), f1(BB2)
23174   // t3 = PHI t2(BB1), f2(BB2)
23175   //
23176   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
23177   // renaming is to note that on the path through BB1, t2 is really just a
23178   // copy of t1, and do that renaming, properly generating:
23179   //
23180   // t2 = PHI t1(BB1), f1(BB2)
23181   // t3 = PHI t1(BB1), f2(BB2)
23182   //
23183   // Case 2, we lower cascaded CMOVs such as
23184   //
23185   //   (CMOV (CMOV F, T, cc1), T, cc2)
23186   //
23187   // to two successive branches.  For that, we look for another CMOV as the
23188   // following instruction.
23189   //
23190   // Without this, we would add a PHI between the two jumps, which ends up
23191   // creating a few copies all around. For instance, for
23192   //
23193   //    (sitofp (zext (fcmp une)))
23194   //
23195   // we would generate:
23196   //
23197   //         ucomiss %xmm1, %xmm0
23198   //         movss  <1.0f>, %xmm0
23199   //         movaps  %xmm0, %xmm1
23200   //         jne     .LBB5_2
23201   //         xorps   %xmm1, %xmm1
23202   // .LBB5_2:
23203   //         jp      .LBB5_4
23204   //         movaps  %xmm1, %xmm0
23205   // .LBB5_4:
23206   //         retq
23207   //
23208   // because this custom-inserter would have generated:
23209   //
23210   //   A
23211   //   | \
23212   //   |  B
23213   //   | /
23214   //   C
23215   //   | \
23216   //   |  D
23217   //   | /
23218   //   E
23219   //
23220   // A: X = ...; Y = ...
23221   // B: empty
23222   // C: Z = PHI [X, A], [Y, B]
23223   // D: empty
23224   // E: PHI [X, C], [Z, D]
23225   //
23226   // If we lower both CMOVs in a single step, we can instead generate:
23227   //
23228   //   A
23229   //   | \
23230   //   |  C
23231   //   | /|
23232   //   |/ |
23233   //   |  |
23234   //   |  D
23235   //   | /
23236   //   E
23237   //
23238   // A: X = ...; Y = ...
23239   // D: empty
23240   // E: PHI [X, A], [X, C], [Y, D]
23241   //
23242   // Which, in our sitofp/fcmp example, gives us something like:
23243   //
23244   //         ucomiss %xmm1, %xmm0
23245   //         movss  <1.0f>, %xmm0
23246   //         jne     .LBB5_4
23247   //         jp      .LBB5_4
23248   //         xorps   %xmm0, %xmm0
23249   // .LBB5_4:
23250   //         retq
23251   //
23252   MachineInstr *CascadedCMOV = nullptr;
23253   MachineInstr *LastCMOV = &MI;
23254   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
23255   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
23256   MachineBasicBlock::iterator NextMIIt =
23257       std::next(MachineBasicBlock::iterator(MI));
23258 
23259   // Check for case 1, where there are multiple CMOVs with the same condition
23260   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
23261   // number of jumps the most.
23262 
23263   if (isCMOVPseudo(MI)) {
23264     // See if we have a string of CMOVS with the same condition.
23265     while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
23266            (NextMIIt->getOperand(3).getImm() == CC ||
23267             NextMIIt->getOperand(3).getImm() == OppCC)) {
23268       LastCMOV = &*NextMIIt;
23269       ++NextMIIt;
23270     }
23271   }
23272 
23273   // Now check for case 2, but only if we didn't already find case 1,
23274   // as indicated by LastCMOV still pointing at MI.
23275   if (LastCMOV == &MI && NextMIIt != BB->end() &&
23276       NextMIIt->getOpcode() == MI.getOpcode() &&
23277       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
23278       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
23279       NextMIIt->getOperand(1).isKill()) {
23280     CascadedCMOV = &*NextMIIt;
23281   }
23282 
23283   MachineBasicBlock *jcc1MBB = nullptr;
23284 
23285   // If we have a cascaded CMOV, we lower it to two successive branches to
23286   // the same block.  EFLAGS is used by both, so mark it as live in the second.
23287   if (CascadedCMOV) {
23288     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
23289     F->insert(It, jcc1MBB);
23290     jcc1MBB->addLiveIn(X86::EFLAGS);
23291   }
23292 
23293   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
23294   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
23295   F->insert(It, copy0MBB);
23296   F->insert(It, sinkMBB);
23297 
23298   // If the EFLAGS register isn't dead in the terminator, then claim that it's
23299   // live into the sink and copy blocks.
23300   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
23301 
23302   MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
23303   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
23304       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
23305     copy0MBB->addLiveIn(X86::EFLAGS);
23306     sinkMBB->addLiveIn(X86::EFLAGS);
23307   }
23308 
23309   // Transfer the remainder of BB and its successor edges to sinkMBB.
23310   sinkMBB->splice(sinkMBB->begin(), BB,
23311                   std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
23312   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
23313 
23314   // Add the true and fallthrough blocks as its successors.
23315   if (CascadedCMOV) {
23316     // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
23317     BB->addSuccessor(jcc1MBB);
23318 
23319     // In that case, jcc1MBB will itself fall through to copy0MBB, and
23320     // jump to the sinkMBB.
23321     jcc1MBB->addSuccessor(copy0MBB);
23322     jcc1MBB->addSuccessor(sinkMBB);
23323   } else {
23324     BB->addSuccessor(copy0MBB);
23325   }
23326 
23327   // The true block target of the first (or only) branch is always sinkMBB.
23328   BB->addSuccessor(sinkMBB);
23329 
23330   // Create the conditional branch instruction.
23331   unsigned Opc = X86::GetCondBranchFromCond(CC);
23332   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
23333 
23334   if (CascadedCMOV) {
23335     unsigned Opc2 = X86::GetCondBranchFromCond(
23336         (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
23337     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
23338   }
23339 
23340   //  copy0MBB:
23341   //   %FalseValue = ...
23342   //   # fallthrough to sinkMBB
23343   copy0MBB->addSuccessor(sinkMBB);
23344 
23345   //  sinkMBB:
23346   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
23347   //  ...
23348   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
23349   MachineBasicBlock::iterator MIItEnd =
23350     std::next(MachineBasicBlock::iterator(LastCMOV));
23351   MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
23352   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
23353   MachineInstrBuilder MIB;
23354 
23355   // As we are creating the PHIs, we have to be careful if there is more than
23356   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
23357   // PHIs have to reference the individual true/false inputs from earlier PHIs.
23358   // That also means that PHI construction must work forward from earlier to
23359   // later, and that the code must maintain a mapping from each earlier PHI's
23360   // destination register to the registers that went into that PHI.
23361 
23362   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
23363     unsigned DestReg = MIIt->getOperand(0).getReg();
23364     unsigned Op1Reg = MIIt->getOperand(1).getReg();
23365     unsigned Op2Reg = MIIt->getOperand(2).getReg();
23366 
23367     // If this CMOV we are generating is the opposite condition from
23368     // the jump we generated, then we have to swap the operands for the
23369     // PHI that is going to be generated.
23370     if (MIIt->getOperand(3).getImm() == OppCC)
23371         std::swap(Op1Reg, Op2Reg);
23372 
23373     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
23374       Op1Reg = RegRewriteTable[Op1Reg].first;
23375 
23376     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
23377       Op2Reg = RegRewriteTable[Op2Reg].second;
23378 
23379     MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
23380                   TII->get(X86::PHI), DestReg)
23381           .addReg(Op1Reg).addMBB(copy0MBB)
23382           .addReg(Op2Reg).addMBB(thisMBB);
23383 
23384     // Add this PHI to the rewrite table.
23385     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
23386   }
23387 
23388   // If we have a cascaded CMOV, the second Jcc provides the same incoming
23389   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
23390   if (CascadedCMOV) {
23391     MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
23392     // Copy the PHI result to the register defined by the second CMOV.
23393     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
23394             DL, TII->get(TargetOpcode::COPY),
23395             CascadedCMOV->getOperand(0).getReg())
23396         .addReg(MI.getOperand(0).getReg());
23397     CascadedCMOV->eraseFromParent();
23398   }
23399 
23400   // Now remove the CMOV(s).
23401   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
23402     (MIIt++)->eraseFromParent();
23403 
23404   return sinkMBB;
23405 }
23406 
23407 MachineBasicBlock *
23408 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
23409                                        MachineBasicBlock *BB) const {
23410   // Combine the following atomic floating-point modification pattern:
23411   //   a.store(reg OP a.load(acquire), release)
23412   // Transform them into:
23413   //   OPss (%gpr), %xmm
23414   //   movss %xmm, (%gpr)
23415   // Or sd equivalent for 64-bit operations.
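  // x86's strong memory model means plain loads and stores already provide the
  // required acquire/release ordering, so no fence or lock prefix is needed.
  // Illustrative result for RELEASE_FADD32mr (register names assumed):
  //   addss (%rdi), %xmm0
  //   movss %xmm0, (%rdi)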
23416   unsigned MOp, FOp;
23417   switch (MI.getOpcode()) {
23418   default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
23419   case X86::RELEASE_FADD32mr:
23420     FOp = X86::ADDSSrm;
23421     MOp = X86::MOVSSmr;
23422     break;
23423   case X86::RELEASE_FADD64mr:
23424     FOp = X86::ADDSDrm;
23425     MOp = X86::MOVSDmr;
23426     break;
23427   }
23428   const X86InstrInfo *TII = Subtarget.getInstrInfo();
23429   DebugLoc DL = MI.getDebugLoc();
23430   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
23431   unsigned ValOpIdx = X86::AddrNumOperands;
23432   unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
23433   MachineInstrBuilder MIB =
23434       BuildMI(*BB, MI, DL, TII->get(FOp),
23435               MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
23436           .addReg(VSrc);
23437   for (int i = 0; i < X86::AddrNumOperands; ++i) {
23438     MachineOperand &Operand = MI.getOperand(i);
23439     // Clear any kill flags on register operands as we'll create a second
23440     // instruction using the same address operands.
23441     if (Operand.isReg())
23442       Operand.setIsKill(false);
23443     MIB.addOperand(Operand);
23444   }
23445   MachineInstr *FOpMI = MIB;
23446   MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
23447   for (int i = 0; i < X86::AddrNumOperands; ++i)
23448     MIB.addOperand(MI.getOperand(i));
23449   MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
23450   MI.eraseFromParent(); // The pseudo instruction is gone now.
23451   return BB;
23452 }
23453 
23454 MachineBasicBlock *
23455 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
23456                                         MachineBasicBlock *BB) const {
23457   MachineFunction *MF = BB->getParent();
23458   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23459   DebugLoc DL = MI.getDebugLoc();
23460   const BasicBlock *LLVM_BB = BB->getBasicBlock();
23461 
23462   assert(MF->shouldSplitStack());
23463 
23464   const bool Is64Bit = Subtarget.is64Bit();
23465   const bool IsLP64 = Subtarget.isTarget64BitLP64();
23466 
23467   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
23468   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
23469 
23470   // BB:
23471   //  ... [Till the alloca]
23472   // If stacklet is not large enough, jump to mallocMBB
23473   //
23474   // bumpMBB:
23475   //  Allocate by subtracting from RSP
23476   //  Jump to continueMBB
23477   //
23478   // mallocMBB:
23479   //  Allocate by call to runtime
23480   //
23481   // continueMBB:
23482   //  ...
23483   //  [rest of original BB]
23484   //
23485 
23486   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23487   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23488   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23489 
23490   MachineRegisterInfo &MRI = MF->getRegInfo();
23491   const TargetRegisterClass *AddrRegClass =
23492       getRegClassFor(getPointerTy(MF->getDataLayout()));
23493 
23494   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23495            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23496            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
23497            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
23498            sizeVReg = MI.getOperand(1).getReg(),
23499            physSPReg =
23500                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
23501 
23502   MachineFunction::iterator MBBIter = ++BB->getIterator();
23503 
23504   MF->insert(MBBIter, bumpMBB);
23505   MF->insert(MBBIter, mallocMBB);
23506   MF->insert(MBBIter, continueMBB);
23507 
23508   continueMBB->splice(continueMBB->begin(), BB,
23509                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
23510   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
23511 
23512   // Add code to the main basic block to check if the stack limit has been hit,
23513   // and if so, jump to mallocMBB otherwise to bumpMBB.
23514   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
23515   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
23516     .addReg(tmpSPVReg).addReg(sizeVReg);
23517   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
23518     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
23519     .addReg(SPLimitVReg);
23520   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
23521 
23522   // bumpMBB simply decreases the stack pointer, since we know the current
23523   // stacklet has enough space.
23524   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
23525     .addReg(SPLimitVReg);
23526   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
23527     .addReg(SPLimitVReg);
23528   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23529 
23530   // Calls into a routine in libgcc to allocate more space from the heap.
23531   const uint32_t *RegMask =
23532       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
23533   if (IsLP64) {
23534     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
23535       .addReg(sizeVReg);
23536     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23537       .addExternalSymbol("__morestack_allocate_stack_space")
23538       .addRegMask(RegMask)
23539       .addReg(X86::RDI, RegState::Implicit)
23540       .addReg(X86::RAX, RegState::ImplicitDefine);
23541   } else if (Is64Bit) {
23542     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
23543       .addReg(sizeVReg);
23544     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23545       .addExternalSymbol("__morestack_allocate_stack_space")
23546       .addRegMask(RegMask)
23547       .addReg(X86::EDI, RegState::Implicit)
23548       .addReg(X86::EAX, RegState::ImplicitDefine);
23549   } else {
23550     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
23551       .addImm(12);
23552     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
23553     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
23554       .addExternalSymbol("__morestack_allocate_stack_space")
23555       .addRegMask(RegMask)
23556       .addReg(X86::EAX, RegState::ImplicitDefine);
23557   }
23558 
23559   if (!Is64Bit)
23560     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
23561       .addImm(16);
23562 
23563   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
23564     .addReg(IsLP64 ? X86::RAX : X86::EAX);
23565   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23566 
23567   // Set up the CFG correctly.
23568   BB->addSuccessor(bumpMBB);
23569   BB->addSuccessor(mallocMBB);
23570   mallocMBB->addSuccessor(continueMBB);
23571   bumpMBB->addSuccessor(continueMBB);
23572 
23573   // Take care of the PHI nodes.
23574   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
23575           MI.getOperand(0).getReg())
23576       .addReg(mallocPtrVReg)
23577       .addMBB(mallocMBB)
23578       .addReg(bumpSPPtrVReg)
23579       .addMBB(bumpMBB);
23580 
23581   // Delete the original pseudo instruction.
23582   MI.eraseFromParent();
23583 
23584   // And we're done.
23585   return continueMBB;
23586 }
23587 
23588 MachineBasicBlock *
23589 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
23590                                        MachineBasicBlock *BB) const {
23591   MachineFunction *MF = BB->getParent();
23592   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23593   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
23594   DebugLoc DL = MI.getDebugLoc();
23595 
23596   assert(!isAsynchronousEHPersonality(
23597              classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
23598          "SEH does not use catchret!");
23599 
23600   // Only 32-bit EH needs to worry about manually restoring stack pointers.
23601   if (!Subtarget.is32Bit())
23602     return BB;
23603 
23604   // C++ EH creates a new target block to hold the restore code, and wires up
23605   // the new block to the return destination with a normal JMP_4.
23606   MachineBasicBlock *RestoreMBB =
23607       MF->CreateMachineBasicBlock(BB->getBasicBlock());
23608   assert(BB->succ_size() == 1);
23609   MF->insert(std::next(BB->getIterator()), RestoreMBB);
23610   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
23611   BB->addSuccessor(RestoreMBB);
23612   MI.getOperand(0).setMBB(RestoreMBB);
23613 
23614   auto RestoreMBBI = RestoreMBB->begin();
23615   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
23616   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
23617   return BB;
23618 }
23619 
23620 MachineBasicBlock *
23621 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
23622                                        MachineBasicBlock *BB) const {
23623   MachineFunction *MF = BB->getParent();
23624   const Constant *PerFn = MF->getFunction()->getPersonalityFn();
23625   bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
23626   // Only 32-bit SEH requires special handling for catchpad.
23627   if (IsSEH && Subtarget.is32Bit()) {
23628     const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23629     DebugLoc DL = MI.getDebugLoc();
23630     BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
23631   }
23632   MI.eraseFromParent();
23633   return BB;
23634 }
23635 
23636 MachineBasicBlock *
23637 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
23638                                       MachineBasicBlock *BB) const {
23639   // Here we replace TLSADDR with the sequence:
23640   //   adjust_stackdown -> TLSADDR -> adjust_stackup.
23641   // We need this because TLSADDR is lowered into calls
23642   // inside MC; therefore, without the two markers, shrink-wrapping
23643   // may push the prologue/epilogue past them.
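  // (For reference: on ELF targets the general-dynamic TLS sequence ultimately
  //  ends in a call to __tls_get_addr, so TLSADDR behaves like an ordinary
  //  call as far as the call-frame markers are concerned.)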
23644   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23645   DebugLoc DL = MI.getDebugLoc();
23646   MachineFunction &MF = *BB->getParent();
23647 
23648   // Emit CALLSEQ_START right before the instruction.
23649   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
23650   MachineInstrBuilder CallseqStart =
23651     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
23652   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
23653 
23654   // Emit CALLSEQ_END right after the instruction.
23655   // We don't call erase from parent because we want to keep the
23656   // original instruction around.
23657   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
23658   MachineInstrBuilder CallseqEnd =
23659     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
23660   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
23661 
23662   return BB;
23663 }
23664 
23665 MachineBasicBlock *
23666 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
23667                                       MachineBasicBlock *BB) const {
23668   // This is straightforward: we take the value we received from our load
23669   // of the relocation, stick it in either RDI (x86-64) or EAX (32-bit),
23670   // and make an indirect call.  The return value will then be in the
23671   // normal return register.
23672   MachineFunction *F = BB->getParent();
23673   const X86InstrInfo *TII = Subtarget.getInstrInfo();
23674   DebugLoc DL = MI.getDebugLoc();
23675 
23676   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
23677   assert(MI.getOperand(3).isGlobal() && "This should be a global");
23678 
23679   // Get a register mask for the lowered call.
23680   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
23681   // proper register mask.
23682   const uint32_t *RegMask =
23683       Subtarget.is64Bit() ?
23684       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
23685       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
23686   if (Subtarget.is64Bit()) {
23687     MachineInstrBuilder MIB =
23688         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
23689             .addReg(X86::RIP)
23690             .addImm(0)
23691             .addReg(0)
23692             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23693                               MI.getOperand(3).getTargetFlags())
23694             .addReg(0);
23695     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
23696     addDirectMem(MIB, X86::RDI);
23697     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
23698   } else if (!isPositionIndependent()) {
23699     MachineInstrBuilder MIB =
23700         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23701             .addReg(0)
23702             .addImm(0)
23703             .addReg(0)
23704             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23705                               MI.getOperand(3).getTargetFlags())
23706             .addReg(0);
23707     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23708     addDirectMem(MIB, X86::EAX);
23709     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23710   } else {
23711     MachineInstrBuilder MIB =
23712         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23713             .addReg(TII->getGlobalBaseReg(F))
23714             .addImm(0)
23715             .addReg(0)
23716             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23717                               MI.getOperand(3).getTargetFlags())
23718             .addReg(0);
23719     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23720     addDirectMem(MIB, X86::EAX);
23721     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23722   }
23723 
23724   MI.eraseFromParent(); // The pseudo instruction is gone now.
23725   return BB;
23726 }
23727 
23728 MachineBasicBlock *
23729 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
23730                                     MachineBasicBlock *MBB) const {
23731   DebugLoc DL = MI.getDebugLoc();
23732   MachineFunction *MF = MBB->getParent();
23733   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23734   MachineRegisterInfo &MRI = MF->getRegInfo();
23735 
23736   const BasicBlock *BB = MBB->getBasicBlock();
23737   MachineFunction::iterator I = ++MBB->getIterator();
23738 
23739   // Memory Reference
23740   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23741   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23742 
23743   unsigned DstReg;
23744   unsigned MemOpndSlot = 0;
23745 
23746   unsigned CurOp = 0;
23747 
23748   DstReg = MI.getOperand(CurOp++).getReg();
23749   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
23750   assert(RC->hasType(MVT::i32) && "Invalid destination!");
23751   unsigned mainDstReg = MRI.createVirtualRegister(RC);
23752   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
23753 
23754   MemOpndSlot = CurOp;
23755 
23756   MVT PVT = getPointerTy(MF->getDataLayout());
23757   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23758          "Invalid Pointer Size!");
23759 
23760   // For v = setjmp(buf), we generate
23761   //
23762   // thisMBB:
23763   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
23764   //  SjLjSetup restoreMBB
23765   //
23766   // mainMBB:
23767   //  v_main = 0
23768   //
23769   // sinkMBB:
23770   //  v = phi(main, restore)
23771   //
23772   // restoreMBB:
23773   //  if base pointer being used, load it from frame
23774   //  v_restore = 1
23775 
23776   MachineBasicBlock *thisMBB = MBB;
23777   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23778   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
23779   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
23780   MF->insert(I, mainMBB);
23781   MF->insert(I, sinkMBB);
23782   MF->push_back(restoreMBB);
23783   restoreMBB->setHasAddressTaken();
23784 
23785   MachineInstrBuilder MIB;
23786 
23787   // Transfer the remainder of BB and its successor edges to sinkMBB.
23788   sinkMBB->splice(sinkMBB->begin(), MBB,
23789                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23790   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23791 
23792   // thisMBB:
23793   unsigned PtrStoreOpc = 0;
23794   unsigned LabelReg = 0;
23795   const int64_t LabelOffset = 1 * PVT.getStoreSize();
23796   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23797                      !isPositionIndependent();
23798 
23799   // Prepare IP either in reg or imm.
23800   if (!UseImmLabel) {
23801     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23802     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
23803     LabelReg = MRI.createVirtualRegister(PtrRC);
23804     if (Subtarget.is64Bit()) {
23805       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
23806               .addReg(X86::RIP)
23807               .addImm(0)
23808               .addReg(0)
23809               .addMBB(restoreMBB)
23810               .addReg(0);
23811     } else {
23812       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
23813       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
23814               .addReg(XII->getGlobalBaseReg(MF))
23815               .addImm(0)
23816               .addReg(0)
23817               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
23818               .addReg(0);
23819     }
23820   } else
23821     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23822   // Store IP
23823   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
23824   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23825     if (i == X86::AddrDisp)
23826       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
23827     else
23828       MIB.addOperand(MI.getOperand(MemOpndSlot + i));
23829   }
23830   if (!UseImmLabel)
23831     MIB.addReg(LabelReg);
23832   else
23833     MIB.addMBB(restoreMBB);
23834   MIB.setMemRefs(MMOBegin, MMOEnd);
23835   // Setup
23836   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
23837           .addMBB(restoreMBB);
23838 
23839   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23840   MIB.addRegMask(RegInfo->getNoPreservedMask());
23841   thisMBB->addSuccessor(mainMBB);
23842   thisMBB->addSuccessor(restoreMBB);
23843 
23844   // mainMBB:
23845   //  EAX = 0
23846   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
23847   mainMBB->addSuccessor(sinkMBB);
23848 
23849   // sinkMBB:
23850   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
23851           TII->get(X86::PHI), DstReg)
23852     .addReg(mainDstReg).addMBB(mainMBB)
23853     .addReg(restoreDstReg).addMBB(restoreMBB);
23854 
23855   // restoreMBB:
23856   if (RegInfo->hasBasePointer(*MF)) {
23857     const bool Uses64BitFramePtr =
23858         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
23859     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
23860     X86FI->setRestoreBasePointer(MF);
23861     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
23862     unsigned BasePtr = RegInfo->getBaseRegister();
23863     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
23864     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
23865                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
23866       .setMIFlag(MachineInstr::FrameSetup);
23867   }
23868   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
23869   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
23870   restoreMBB->addSuccessor(sinkMBB);
23871 
23872   MI.eraseFromParent();
23873   return sinkMBB;
23874 }
23875 
23876 MachineBasicBlock *
23877 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
23878                                      MachineBasicBlock *MBB) const {
23879   DebugLoc DL = MI.getDebugLoc();
23880   MachineFunction *MF = MBB->getParent();
23881   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23882   MachineRegisterInfo &MRI = MF->getRegInfo();
23883 
23884   // Memory Reference
23885   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23886   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23887 
23888   MVT PVT = getPointerTy(MF->getDataLayout());
23889   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23890          "Invalid Pointer Size!");
23891 
23892   const TargetRegisterClass *RC =
23893     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23894   unsigned Tmp = MRI.createVirtualRegister(RC);
23895   // Since FP is only updated here but NOT referenced, it's treated as GPR.
23896   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23897   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
23898   unsigned SP = RegInfo->getStackRegister();
23899 
23900   MachineInstrBuilder MIB;
23901 
23902   const int64_t LabelOffset = 1 * PVT.getStoreSize();
23903   const int64_t SPOffset = 2 * PVT.getStoreSize();
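  // Buffer layout assumed by this lowering (one pointer-sized slot each):
  // slot 0 holds the frame pointer, slot 1 the resume address stored by
  // emitEHSjLjSetJmp, and slot 2 the stack pointer.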
23904 
23905   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
23906   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
23907 
23908   // Reload FP
23909   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
23910   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
23911     MIB.addOperand(MI.getOperand(i));
23912   MIB.setMemRefs(MMOBegin, MMOEnd);
23913   // Reload IP
23914   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
23915   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23916     if (i == X86::AddrDisp)
23917       MIB.addDisp(MI.getOperand(i), LabelOffset);
23918     else
23919       MIB.addOperand(MI.getOperand(i));
23920   }
23921   MIB.setMemRefs(MMOBegin, MMOEnd);
23922   // Reload SP
23923   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
23924   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23925     if (i == X86::AddrDisp)
23926       MIB.addDisp(MI.getOperand(i), SPOffset);
23927     else
23928       MIB.addOperand(MI.getOperand(i));
23929   }
23930   MIB.setMemRefs(MMOBegin, MMOEnd);
23931   // Jump
23932   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
23933 
23934   MI.eraseFromParent();
23935   return MBB;
23936 }
23937 
23938 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
23939                                                MachineBasicBlock *MBB,
23940                                                MachineBasicBlock *DispatchBB,
23941                                                int FI) const {
23942   DebugLoc DL = MI.getDebugLoc();
23943   MachineFunction *MF = MBB->getParent();
23944   MachineRegisterInfo *MRI = &MF->getRegInfo();
23945   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23946 
23947   MVT PVT = getPointerTy(MF->getDataLayout());
23948   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
23949 
23950   unsigned Op = 0;
23951   unsigned VR = 0;
23952 
23953   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23954                      !isPositionIndependent();
23955 
23956   if (UseImmLabel) {
23957     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23958   } else {
23959     const TargetRegisterClass *TRC =
23960         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23961     VR = MRI->createVirtualRegister(TRC);
23962     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23963 
23964     /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
23965 
23966     if (Subtarget.is64Bit())
23967       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
23968           .addReg(X86::RIP)
23969           .addImm(1)
23970           .addReg(0)
23971           .addMBB(DispatchBB)
23972           .addReg(0);
23973     else
23974       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
23975           .addReg(0) /* XII->getGlobalBaseReg(MF) */
23976           .addImm(1)
23977           .addReg(0)
23978           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
23979           .addReg(0);
23980   }
23981 
23982   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
23983   addFrameReference(MIB, FI, 36);
23984   if (UseImmLabel)
23985     MIB.addMBB(DispatchBB);
23986   else
23987     MIB.addReg(VR);
23988 }
23989 
23990 MachineBasicBlock *
23991 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
23992                                          MachineBasicBlock *BB) const {
23993   DebugLoc DL = MI.getDebugLoc();
23994   MachineFunction *MF = BB->getParent();
23995   MachineModuleInfo *MMI = &MF->getMMI();
23996   MachineFrameInfo *MFI = MF->getFrameInfo();
23997   MachineRegisterInfo *MRI = &MF->getRegInfo();
23998   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23999   int FI = MFI->getFunctionContextIndex();
24000 
24001   // Get a mapping of the call site numbers to all of the landing pads they're
24002   // associated with.
24003   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
24004   unsigned MaxCSNum = 0;
24005   for (auto &MBB : *MF) {
24006     if (!MBB.isEHPad())
24007       continue;
24008 
24009     MCSymbol *Sym = nullptr;
24010     for (const auto &MI : MBB) {
24011       if (MI.isDebugValue())
24012         continue;
24013 
24014       assert(MI.isEHLabel() && "expected EH_LABEL");
24015       Sym = MI.getOperand(0).getMCSymbol();
24016       break;
24017     }
24018 
24019     if (!MMI->hasCallSiteLandingPad(Sym))
24020       continue;
24021 
24022     for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
24023       CallSiteNumToLPad[CSI].push_back(&MBB);
24024       MaxCSNum = std::max(MaxCSNum, CSI);
24025     }
24026   }
24027 
24028   // Get an ordered list of the machine basic blocks for the jump table.
24029   std::vector<MachineBasicBlock *> LPadList;
24030   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
24031   LPadList.reserve(CallSiteNumToLPad.size());
24032 
24033   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
24034     for (auto &LP : CallSiteNumToLPad[CSI]) {
24035       LPadList.push_back(LP);
24036       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
24037     }
24038   }
24039 
24040   assert(!LPadList.empty() &&
24041          "No landing pad destinations for the dispatch jump table!");
24042 
24043   // Create the MBBs for the dispatch code.
24044 
24045   // Shove the dispatch's address into the return slot in the function context.
24046   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
24047   DispatchBB->setIsEHPad(true);
24048 
24049   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
24050   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
24051   DispatchBB->addSuccessor(TrapBB);
24052 
24053   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
24054   DispatchBB->addSuccessor(DispContBB);
24055 
24056   // Insert MBBs.
24057   MF->push_back(DispatchBB);
24058   MF->push_back(DispContBB);
24059   MF->push_back(TrapBB);
24060 
24061   // Insert code into the entry block that creates and registers the function
24062   // context.
24063   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
24064 
24065   // Create the jump table and associated information
24066   MachineJumpTableInfo *JTI =
24067       MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
24068   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
24069 
24070   const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
24071   const X86RegisterInfo &RI = XII->getRegisterInfo();
24072 
24073   // Add a register mask with no preserved registers.  This results in all
24074   // registers being marked as clobbered.
24075   if (RI.hasBasePointer(*MF)) {
24076     const bool FPIs64Bit =
24077         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
24078     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
24079     MFI->setRestoreBasePointer(MF);
24080 
24081     unsigned FP = RI.getFrameRegister(*MF);
24082     unsigned BP = RI.getBaseRegister();
24083     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
24084     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
24085                  MFI->getRestoreBasePointerOffset())
24086         .addRegMask(RI.getNoPreservedMask());
24087   } else {
24088     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
24089         .addRegMask(RI.getNoPreservedMask());
24090   }
24091 
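  // Load what the SjLj machinery stored as the call-site index (the slot 4
  // bytes into the function context here) and range-check it; indices larger
  // than the number of landing pads jump to the trap block.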
24092   unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24093   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
24094                     4);
24095   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
24096       .addReg(IReg)
24097       .addImm(LPadList.size());
24098   BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
24099 
24100   unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24101   BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
24102       .addReg(IReg)
24103       .addImm(1);
24104   BuildMI(DispContBB, DL,
24105           TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
24106       .addReg(0)
24107       .addImm(Subtarget.is64Bit() ? 8 : 4)
24108       .addReg(JReg)
24109       .addJumpTableIndex(MJTI)
24110       .addReg(0);
24111 
24112   // Add the jump table entries as successors to the MBB.
24113   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
24114   for (auto &LP : LPadList)
24115     if (SeenMBBs.insert(LP).second)
24116       DispContBB->addSuccessor(LP);
24117 
24118   // N.B. The order in which the invoke BBs are processed doesn't matter here.
24119   SmallVector<MachineBasicBlock *, 64> MBBLPads;
24120   const MCPhysReg *SavedRegs =
24121       Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
24122   for (MachineBasicBlock *MBB : InvokeBBs) {
24123     // Remove the landing pad successor from the invoke block and replace it
24124     // with the new dispatch block.
24125     // Keep a copy of Successors since it's modified inside the loop.
24126     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
24127                                                    MBB->succ_rend());
24128     // FIXME: Avoid quadratic complexity.
24129     for (auto MBBS : Successors) {
24130       if (MBBS->isEHPad()) {
24131         MBB->removeSuccessor(MBBS);
24132         MBBLPads.push_back(MBBS);
24133       }
24134     }
24135 
24136     MBB->addSuccessor(DispatchBB);
24137 
24138     // Find the invoke call and mark all of the callee-saved registers as
24139     // 'implicit defined' so that they're spilled.  This prevents later passes
24140     // from moving instructions to before the EH block, where they would never
24141     // be executed.
24142     for (auto &II : reverse(*MBB)) {
24143       if (!II.isCall())
24144         continue;
24145 
24146       DenseMap<unsigned, bool> DefRegs;
24147       for (auto &MOp : II.operands())
24148         if (MOp.isReg())
24149           DefRegs[MOp.getReg()] = true;
24150 
24151       MachineInstrBuilder MIB(*MF, &II);
24152       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
24153         unsigned Reg = SavedRegs[RI];
24154         if (!DefRegs[Reg])
24155           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
24156       }
24157 
24158       break;
24159     }
24160   }
24161 
24162   // Mark all former landing pads as non-landing pads.  The dispatch is the only
24163   // landing pad now.
24164   for (auto &LP : MBBLPads)
24165     LP->setIsEHPad(false);
24166 
24167   // The instruction is gone now.
24168   MI.eraseFromParent();
24169   return BB;
24170 }
24171 
24172 // Replace 213-type (isel default) FMA3 instructions with 231-type for
24173 // accumulator loops. Writing back to the accumulator allows the coalescer
24174 // to remove extra copies in the loop.
24175 // FIXME: Do this on AVX512.  We don't support 231 variants yet (PR23937).
24176 MachineBasicBlock *
24177 X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
24178                                  MachineBasicBlock *MBB) const {
24179   MachineOperand &AddendOp = MI.getOperand(3);
24180 
24181   // Bail out early if the addend isn't a register - we can't switch these.
24182   if (!AddendOp.isReg())
24183     return MBB;
24184 
24185   MachineFunction &MF = *MBB->getParent();
24186   MachineRegisterInfo &MRI = MF.getRegInfo();
24187 
24188   // Check whether the addend is defined by a PHI:
24189   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
24190   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
24191   if (!AddendDef.isPHI())
24192     return MBB;
24193 
24194   // Look for the following pattern:
24195   // loop:
24196   //   %addend = phi [%entry, 0], [%loop, %result]
24197   //   ...
24198   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
24199 
24200   // Replace with:
24201   //   loop:
24202   //   %addend = phi [%entry, 0], [%loop, %result]
24203   //   ...
24204   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
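  // Both forms compute m1 * m2 + addend; they differ only in which source the
  // destination is tied to.  With the 231 form the destination is tied to
  // %addend, so %result and %addend can share one register across loop
  // iterations, letting the coalescer drop a copy per iteration.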
24205 
24206   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
24207     assert(AddendDef.getOperand(i).isReg());
24208     MachineOperand PHISrcOp = AddendDef.getOperand(i);
24209     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
24210     if (&PHISrcInst == &MI) {
24211       // Found a matching instruction.
24212       unsigned NewFMAOpc = 0;
24213       switch (MI.getOpcode()) {
24214       case X86::VFMADDPDr213r:
24215         NewFMAOpc = X86::VFMADDPDr231r;
24216         break;
24217       case X86::VFMADDPSr213r:
24218         NewFMAOpc = X86::VFMADDPSr231r;
24219         break;
24220       case X86::VFMADDSDr213r:
24221         NewFMAOpc = X86::VFMADDSDr231r;
24222         break;
24223       case X86::VFMADDSSr213r:
24224         NewFMAOpc = X86::VFMADDSSr231r;
24225         break;
24226       case X86::VFMSUBPDr213r:
24227         NewFMAOpc = X86::VFMSUBPDr231r;
24228         break;
24229       case X86::VFMSUBPSr213r:
24230         NewFMAOpc = X86::VFMSUBPSr231r;
24231         break;
24232       case X86::VFMSUBSDr213r:
24233         NewFMAOpc = X86::VFMSUBSDr231r;
24234         break;
24235       case X86::VFMSUBSSr213r:
24236         NewFMAOpc = X86::VFMSUBSSr231r;
24237         break;
24238       case X86::VFNMADDPDr213r:
24239         NewFMAOpc = X86::VFNMADDPDr231r;
24240         break;
24241       case X86::VFNMADDPSr213r:
24242         NewFMAOpc = X86::VFNMADDPSr231r;
24243         break;
24244       case X86::VFNMADDSDr213r:
24245         NewFMAOpc = X86::VFNMADDSDr231r;
24246         break;
24247       case X86::VFNMADDSSr213r:
24248         NewFMAOpc = X86::VFNMADDSSr231r;
24249         break;
24250       case X86::VFNMSUBPDr213r:
24251         NewFMAOpc = X86::VFNMSUBPDr231r;
24252         break;
24253       case X86::VFNMSUBPSr213r:
24254         NewFMAOpc = X86::VFNMSUBPSr231r;
24255         break;
24256       case X86::VFNMSUBSDr213r:
24257         NewFMAOpc = X86::VFNMSUBSDr231r;
24258         break;
24259       case X86::VFNMSUBSSr213r:
24260         NewFMAOpc = X86::VFNMSUBSSr231r;
24261         break;
24262       case X86::VFMADDSUBPDr213r:
24263         NewFMAOpc = X86::VFMADDSUBPDr231r;
24264         break;
24265       case X86::VFMADDSUBPSr213r:
24266         NewFMAOpc = X86::VFMADDSUBPSr231r;
24267         break;
24268       case X86::VFMSUBADDPDr213r:
24269         NewFMAOpc = X86::VFMSUBADDPDr231r;
24270         break;
24271       case X86::VFMSUBADDPSr213r:
24272         NewFMAOpc = X86::VFMSUBADDPSr231r;
24273         break;
24274 
24275       case X86::VFMADDPDr213rY:
24276         NewFMAOpc = X86::VFMADDPDr231rY;
24277         break;
24278       case X86::VFMADDPSr213rY:
24279         NewFMAOpc = X86::VFMADDPSr231rY;
24280         break;
24281       case X86::VFMSUBPDr213rY:
24282         NewFMAOpc = X86::VFMSUBPDr231rY;
24283         break;
24284       case X86::VFMSUBPSr213rY:
24285         NewFMAOpc = X86::VFMSUBPSr231rY;
24286         break;
24287       case X86::VFNMADDPDr213rY:
24288         NewFMAOpc = X86::VFNMADDPDr231rY;
24289         break;
24290       case X86::VFNMADDPSr213rY:
24291         NewFMAOpc = X86::VFNMADDPSr231rY;
24292         break;
24293       case X86::VFNMSUBPDr213rY:
24294         NewFMAOpc = X86::VFNMSUBPDr231rY;
24295         break;
24296       case X86::VFNMSUBPSr213rY:
24297         NewFMAOpc = X86::VFNMSUBPSr231rY;
24298         break;
24299       case X86::VFMADDSUBPDr213rY:
24300         NewFMAOpc = X86::VFMADDSUBPDr231rY;
24301         break;
24302       case X86::VFMADDSUBPSr213rY:
24303         NewFMAOpc = X86::VFMADDSUBPSr231rY;
24304         break;
24305       case X86::VFMSUBADDPDr213rY:
24306         NewFMAOpc = X86::VFMSUBADDPDr231rY;
24307         break;
24308       case X86::VFMSUBADDPSr213rY:
24309         NewFMAOpc = X86::VFMSUBADDPSr231rY;
24310         break;
24311       default:
24312         llvm_unreachable("Unrecognized FMA variant.");
24313       }
24314 
24315       const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
24316       MachineInstrBuilder MIB =
24317           BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
24318               .addOperand(MI.getOperand(0))
24319               .addOperand(MI.getOperand(3))
24320               .addOperand(MI.getOperand(2))
24321               .addOperand(MI.getOperand(1));
24322       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
24323       MI.eraseFromParent();
24324     }
24325   }
24326 
24327   return MBB;
24328 }
24329 
24330 MachineBasicBlock *
24331 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
24332                                                MachineBasicBlock *BB) const {
24333   switch (MI.getOpcode()) {
24334   default: llvm_unreachable("Unexpected instr type to insert");
24335   case X86::TAILJMPd64:
24336   case X86::TAILJMPr64:
24337   case X86::TAILJMPm64:
24338   case X86::TAILJMPd64_REX:
24339   case X86::TAILJMPr64_REX:
24340   case X86::TAILJMPm64_REX:
24341     llvm_unreachable("TAILJMP64 would not be touched here.");
24342   case X86::TCRETURNdi64:
24343   case X86::TCRETURNri64:
24344   case X86::TCRETURNmi64:
24345     return BB;
24346   case X86::TLS_addr32:
24347   case X86::TLS_addr64:
24348   case X86::TLS_base_addr32:
24349   case X86::TLS_base_addr64:
24350     return EmitLoweredTLSAddr(MI, BB);
24351   case X86::CATCHRET:
24352     return EmitLoweredCatchRet(MI, BB);
24353   case X86::CATCHPAD:
24354     return EmitLoweredCatchPad(MI, BB);
24355   case X86::SEG_ALLOCA_32:
24356   case X86::SEG_ALLOCA_64:
24357     return EmitLoweredSegAlloca(MI, BB);
24358   case X86::TLSCall_32:
24359   case X86::TLSCall_64:
24360     return EmitLoweredTLSCall(MI, BB);
24361   case X86::CMOV_FR32:
24362   case X86::CMOV_FR64:
24363   case X86::CMOV_FR128:
24364   case X86::CMOV_GR8:
24365   case X86::CMOV_GR16:
24366   case X86::CMOV_GR32:
24367   case X86::CMOV_RFP32:
24368   case X86::CMOV_RFP64:
24369   case X86::CMOV_RFP80:
24370   case X86::CMOV_V2F64:
24371   case X86::CMOV_V2I64:
24372   case X86::CMOV_V4F32:
24373   case X86::CMOV_V4F64:
24374   case X86::CMOV_V4I64:
24375   case X86::CMOV_V16F32:
24376   case X86::CMOV_V8F32:
24377   case X86::CMOV_V8F64:
24378   case X86::CMOV_V8I64:
24379   case X86::CMOV_V8I1:
24380   case X86::CMOV_V16I1:
24381   case X86::CMOV_V32I1:
24382   case X86::CMOV_V64I1:
24383     return EmitLoweredSelect(MI, BB);
24384 
24385   case X86::RDFLAGS32:
24386   case X86::RDFLAGS64: {
24387     DebugLoc DL = MI.getDebugLoc();
24388     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24389     unsigned PushF =
24390         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
24391     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
24392     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
24393     // Permit reads of the FLAGS register without it being defined.
24394     // This intrinsic exists to read external processor state in flags, such as
24395     // the trap flag, interrupt flag, and direction flag, none of which are
24396     // modeled by the backend.
24397     Push->getOperand(2).setIsUndef();
24398     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
24399 
24400     MI.eraseFromParent(); // The pseudo is gone now.
24401     return BB;
24402   }
24403 
24404   case X86::WRFLAGS32:
24405   case X86::WRFLAGS64: {
24406     DebugLoc DL = MI.getDebugLoc();
24407     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24408     unsigned Push =
24409         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
24410     unsigned PopF =
24411         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
24412     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
24413     BuildMI(*BB, MI, DL, TII->get(PopF));
24414 
24415     MI.eraseFromParent(); // The pseudo is gone now.
24416     return BB;
24417   }
24418 
24419   case X86::RELEASE_FADD32mr:
24420   case X86::RELEASE_FADD64mr:
24421     return EmitLoweredAtomicFP(MI, BB);
24422 
24423   case X86::FP32_TO_INT16_IN_MEM:
24424   case X86::FP32_TO_INT32_IN_MEM:
24425   case X86::FP32_TO_INT64_IN_MEM:
24426   case X86::FP64_TO_INT16_IN_MEM:
24427   case X86::FP64_TO_INT32_IN_MEM:
24428   case X86::FP64_TO_INT64_IN_MEM:
24429   case X86::FP80_TO_INT16_IN_MEM:
24430   case X86::FP80_TO_INT32_IN_MEM:
24431   case X86::FP80_TO_INT64_IN_MEM: {
24432     MachineFunction *F = BB->getParent();
24433     const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24434     DebugLoc DL = MI.getDebugLoc();
24435 
24436     // Change the floating point control register to use "round towards zero"
24437     // mode when truncating to an integer value.
24438     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
24439     addFrameReference(BuildMI(*BB, MI, DL,
24440                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
24441 
24442     // Load the old value of the control word...
24443     unsigned OldCW =
24444       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
24445     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
24446                       CWFrameIdx);
24447 
24448     // Set the high part to be round to zero...
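    // 0xC7F keeps all x87 exceptions masked and sets the rounding-control bits
    // (11:10) to 11b, i.e. round toward zero (truncate).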
24449     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
24450       .addImm(0xC7F);
24451 
24452     // Reload the modified control word now...
24453     addFrameReference(BuildMI(*BB, MI, DL,
24454                               TII->get(X86::FLDCW16m)), CWFrameIdx);
24455 
24456     // Restore the memory image of the control word to its original value
24457     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
24458       .addReg(OldCW);
24459 
24460     // Get the X86 opcode to use.
24461     unsigned Opc;
24462     switch (MI.getOpcode()) {
24463     default: llvm_unreachable("illegal opcode!");
24464     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
24465     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
24466     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
24467     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
24468     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
24469     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
24470     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
24471     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
24472     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
24473     }
24474 
24475     X86AddressMode AM = getAddressFromInstr(&MI, 0);
24476     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
24477         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
24478 
24479     // Reload the original control word now.
24480     addFrameReference(BuildMI(*BB, MI, DL,
24481                               TII->get(X86::FLDCW16m)), CWFrameIdx);
24482 
24483     MI.eraseFromParent(); // The pseudo instruction is gone now.
24484     return BB;
24485   }
24486     // String/text processing lowering.
24487   case X86::PCMPISTRM128REG:
24488   case X86::VPCMPISTRM128REG:
24489   case X86::PCMPISTRM128MEM:
24490   case X86::VPCMPISTRM128MEM:
24491   case X86::PCMPESTRM128REG:
24492   case X86::VPCMPESTRM128REG:
24493   case X86::PCMPESTRM128MEM:
24494   case X86::VPCMPESTRM128MEM:
24495     assert(Subtarget.hasSSE42() &&
24496            "Target must have SSE4.2 or AVX features enabled");
24497     return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
24498 
24499   // String/text processing lowering.
24500   case X86::PCMPISTRIREG:
24501   case X86::VPCMPISTRIREG:
24502   case X86::PCMPISTRIMEM:
24503   case X86::VPCMPISTRIMEM:
24504   case X86::PCMPESTRIREG:
24505   case X86::VPCMPESTRIREG:
24506   case X86::PCMPESTRIMEM:
24507   case X86::VPCMPESTRIMEM:
24508     assert(Subtarget.hasSSE42() &&
24509            "Target must have SSE4.2 or AVX features enabled");
24510     return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
24511 
24512   // Thread synchronization.
24513   case X86::MONITOR:
24514     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
24515   case X86::MONITORX:
24516     return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
24517   // PKU feature
24518   case X86::WRPKRU:
24519     return emitWRPKRU(MI, BB, Subtarget);
24520   case X86::RDPKRU:
24521     return emitRDPKRU(MI, BB, Subtarget);
24522   // xbegin
24523   case X86::XBEGIN:
24524     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
24525 
24526   case X86::VASTART_SAVE_XMM_REGS:
24527     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
24528 
24529   case X86::VAARG_64:
24530     return EmitVAARG64WithCustomInserter(MI, BB);
24531 
24532   case X86::EH_SjLj_SetJmp32:
24533   case X86::EH_SjLj_SetJmp64:
24534     return emitEHSjLjSetJmp(MI, BB);
24535 
24536   case X86::EH_SjLj_LongJmp32:
24537   case X86::EH_SjLj_LongJmp64:
24538     return emitEHSjLjLongJmp(MI, BB);
24539 
24540   case X86::Int_eh_sjlj_setup_dispatch:
24541     return EmitSjLjDispatchBlock(MI, BB);
24542 
24543   case TargetOpcode::STATEPOINT:
24544     // As an implementation detail, STATEPOINT shares the STACKMAP format at
24545     // this point in the process.  We diverge later.
24546     return emitPatchPoint(MI, BB);
24547 
24548   case TargetOpcode::STACKMAP:
24549   case TargetOpcode::PATCHPOINT:
24550     return emitPatchPoint(MI, BB);
24551 
24552   case X86::VFMADDPDr213r:
24553   case X86::VFMADDPSr213r:
24554   case X86::VFMADDSDr213r:
24555   case X86::VFMADDSSr213r:
24556   case X86::VFMSUBPDr213r:
24557   case X86::VFMSUBPSr213r:
24558   case X86::VFMSUBSDr213r:
24559   case X86::VFMSUBSSr213r:
24560   case X86::VFNMADDPDr213r:
24561   case X86::VFNMADDPSr213r:
24562   case X86::VFNMADDSDr213r:
24563   case X86::VFNMADDSSr213r:
24564   case X86::VFNMSUBPDr213r:
24565   case X86::VFNMSUBPSr213r:
24566   case X86::VFNMSUBSDr213r:
24567   case X86::VFNMSUBSSr213r:
24568   case X86::VFMADDSUBPDr213r:
24569   case X86::VFMADDSUBPSr213r:
24570   case X86::VFMSUBADDPDr213r:
24571   case X86::VFMSUBADDPSr213r:
24572   case X86::VFMADDPDr213rY:
24573   case X86::VFMADDPSr213rY:
24574   case X86::VFMSUBPDr213rY:
24575   case X86::VFMSUBPSr213rY:
24576   case X86::VFNMADDPDr213rY:
24577   case X86::VFNMADDPSr213rY:
24578   case X86::VFNMSUBPDr213rY:
24579   case X86::VFNMSUBPSr213rY:
24580   case X86::VFMADDSUBPDr213rY:
24581   case X86::VFMADDSUBPSr213rY:
24582   case X86::VFMSUBADDPDr213rY:
24583   case X86::VFMSUBADDPSr213rY:
24584     return emitFMA3Instr(MI, BB);
24585   case X86::LCMPXCHG8B_SAVE_EBX:
24586   case X86::LCMPXCHG16B_SAVE_RBX: {
24587     unsigned BasePtr =
24588         MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
24589     if (!BB->isLiveIn(BasePtr))
24590       BB->addLiveIn(BasePtr);
24591     return BB;
24592   }
24593   }
24594 }
24595 
24596 //===----------------------------------------------------------------------===//
24597 //                           X86 Optimization Hooks
24598 //===----------------------------------------------------------------------===//
24599 
24600 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
24601                                                       APInt &KnownZero,
24602                                                       APInt &KnownOne,
24603                                                       const SelectionDAG &DAG,
24604                                                       unsigned Depth) const {
24605   unsigned BitWidth = KnownZero.getBitWidth();
24606   unsigned Opc = Op.getOpcode();
24607   assert((Opc >= ISD::BUILTIN_OP_END ||
24608           Opc == ISD::INTRINSIC_WO_CHAIN ||
24609           Opc == ISD::INTRINSIC_W_CHAIN ||
24610           Opc == ISD::INTRINSIC_VOID) &&
24611          "Should use MaskedValueIsZero if you don't know whether Op"
24612          " is a target node!");
24613 
24614   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
24615   switch (Opc) {
24616   default: break;
24617   case X86ISD::ADD:
24618   case X86ISD::SUB:
24619   case X86ISD::ADC:
24620   case X86ISD::SBB:
24621   case X86ISD::SMUL:
24622   case X86ISD::UMUL:
24623   case X86ISD::INC:
24624   case X86ISD::DEC:
24625   case X86ISD::OR:
24626   case X86ISD::XOR:
24627   case X86ISD::AND:
24628     // These nodes' second result is a boolean.
24629     if (Op.getResNo() == 0)
24630       break;
24631     // Fallthrough
24632   case X86ISD::SETCC:
24633     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
24634     break;
24635   case X86ISD::MOVMSK: {
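    // MOVMSK packs one bit per input element (the element's sign bit) into the
    // low bits of the result, so all higher result bits are known zero.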
24636     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
24637     KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
24638     break;
24639   }
24640   }
24641 }
24642 
24643 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
24644   SDValue Op,
24645   const SelectionDAG &,
24646   unsigned Depth) const {
24647   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
24648   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
24649     return Op.getValueType().getScalarSizeInBits();
24650 
24651   // Fallback case.
24652   return 1;
24653 }
24654 
24655 /// Returns true (and the GlobalValue and the offset) if the node is a
24656 /// GlobalAddress + offset.
24657 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
24658                                        const GlobalValue* &GA,
24659                                        int64_t &Offset) const {
24660   if (N->getOpcode() == X86ISD::Wrapper) {
24661     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
24662       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
24663       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
24664       return true;
24665     }
24666   }
24667   return TargetLowering::isGAPlusOffset(N, GA, Offset);
24668 }
24669 
24670 /// Performs shuffle combines for 256-bit vectors.
24671 /// FIXME: This could be expanded to support 512-bit vectors as well.
24672 static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
24673                                  TargetLowering::DAGCombinerInfo &DCI,
24674                                  const X86Subtarget &Subtarget) {
24675   SDLoc dl(N);
24676   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
24677   SDValue V1 = SVOp->getOperand(0);
24678   SDValue V2 = SVOp->getOperand(1);
24679   MVT VT = SVOp->getSimpleValueType(0);
24680   unsigned NumElems = VT.getVectorNumElements();
24681 
24682   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
24683       V2.getOpcode() == ISD::CONCAT_VECTORS) {
24684     //
24685     //                   0,0,0,...
24686     //                      |
24687     //    V      UNDEF    BUILD_VECTOR    UNDEF
24688     //     \      /           \           /
24689     //  CONCAT_VECTOR         CONCAT_VECTOR
24690     //         \                  /
24691     //          \                /
24692     //          RESULT: V + zero extended
24693     //
24694     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
24695         !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
24696       return SDValue();
24697 
24698     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
24699       return SDValue();
24700 
24701     // To match the shuffle mask, the first half of the mask should
24702     // be exactly the first vector, and all the rest a splat with the
24703     // first element of the second one.
24704     for (unsigned i = 0; i != NumElems/2; ++i)
24705       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
24706           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
24707         return SDValue();
24708 
24709     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
24710     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
24711       if (Ld->hasNUsesOfValue(1, 0)) {
24712         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
24713         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
24714         SDValue ResNode =
24715           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
24716                                   Ld->getMemoryVT(),
24717                                   Ld->getPointerInfo(),
24718                                   Ld->getAlignment(),
24719                                   false/*isVolatile*/, true/*ReadMem*/,
24720                                   false/*WriteMem*/);
24721 
24722         // Make sure the newly-created LOAD is in the same position as Ld in
24723         // terms of dependency. We create a TokenFactor for Ld and ResNode,
24724         // and update uses of Ld's output chain to use the TokenFactor.
24725         if (Ld->hasAnyUseOfValue(1)) {
24726           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24727                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
24728           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
24729           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
24730                                  SDValue(ResNode.getNode(), 1));
24731         }
24732 
24733         return DAG.getBitcast(VT, ResNode);
24734       }
24735     }
24736 
24737     // Emit a zeroed vector and insert the desired subvector into its
24738     // first half.
24739     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
24740     SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
24741     return DCI.CombineTo(N, InsV);
24742   }
24743 
24744   return SDValue();
24745 }
24746 
24747 // Attempt to match a combined shuffle mask against supported unary shuffle
24748 // instructions.
24749 // TODO: Investigate sharing more of this with shuffle lowering.
24750 static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24751                                     const X86Subtarget &Subtarget,
24752                                     unsigned &Shuffle, MVT &ShuffleVT) {
24753   bool FloatDomain = SrcVT.isFloatingPoint() ||
24754                      (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
24755 
24756   // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
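  // A mask of {0, SM_SentinelZero} means "keep element 0 and zero the upper
  // element", which is exactly the v2i64 MOVQ behaviour.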
24757   if (!FloatDomain && SrcVT.is128BitVector() &&
24758       isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
24759     Shuffle = X86ISD::VZEXT_MOVL;
24760     ShuffleVT = MVT::v2i64;
24761     return true;
24762   }
24763 
24764   // Check if we have SSE3, which lets us use MOVDDUP etc.  These
24765   // instructions are no slower than UNPCKLPD but have the option to
24766   // fold the input operand into even an unaligned memory load.
24767   if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
24768     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
24769       Shuffle = X86ISD::MOVDDUP;
24770       ShuffleVT = MVT::v2f64;
24771       return true;
24772     }
24773     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24774       Shuffle = X86ISD::MOVSLDUP;
24775       ShuffleVT = MVT::v4f32;
24776       return true;
24777     }
24778     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
24779       Shuffle = X86ISD::MOVSHDUP;
24780       ShuffleVT = MVT::v4f32;
24781       return true;
24782     }
24783   }
24784 
24785   if (SrcVT.is256BitVector() && FloatDomain) {
24786     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
24787     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24788       Shuffle = X86ISD::MOVDDUP;
24789       ShuffleVT = MVT::v4f64;
24790       return true;
24791     }
24792     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24793       Shuffle = X86ISD::MOVSLDUP;
24794       ShuffleVT = MVT::v8f32;
24795       return true;
24796     }
24797     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
24798       Shuffle = X86ISD::MOVSHDUP;
24799       ShuffleVT = MVT::v8f32;
24800       return true;
24801     }
24802   }
24803 
24804   if (SrcVT.is512BitVector() && FloatDomain) {
24805     assert(Subtarget.hasAVX512() &&
24806            "AVX512 required for 512-bit vector shuffles");
24807     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24808       Shuffle = X86ISD::MOVDDUP;
24809       ShuffleVT = MVT::v8f64;
24810       return true;
24811     }
24812     if (isTargetShuffleEquivalent(
24813             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
24814       Shuffle = X86ISD::MOVSLDUP;
24815       ShuffleVT = MVT::v16f32;
24816       return true;
24817     }
24818     if (isTargetShuffleEquivalent(
24819             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
24820       Shuffle = X86ISD::MOVSHDUP;
24821       ShuffleVT = MVT::v16f32;
24822       return true;
24823     }
24824   }
24825 
24826   // Attempt to match against broadcast-from-vector.
24827   if (Subtarget.hasAVX2()) {
24828     unsigned NumElts = Mask.size();
24829     SmallVector<int, 64> BroadcastMask(NumElts, 0);
24830     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
24831       unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
24832       ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
24833                               : MVT::getIntegerVT(EltSize);
24834       ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
24835       Shuffle = X86ISD::VBROADCAST;
24836       return true;
24837     }
24838   }
24839 
24840   return false;
24841 }
24842 
24843 // Attempt to match a combined shuffle mask against supported unary immediate
24844 // permute instructions.
24845 // TODO: Investigate sharing more of this with shuffle lowering.
24846 static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24847                                       const X86Subtarget &Subtarget,
24848                                       unsigned &Shuffle, MVT &ShuffleVT,
24849                                       unsigned &PermuteImm) {
24850   // Ensure the mask doesn't contain any zero elements.
24851   for (int M : Mask) {
24852     if (M == SM_SentinelZero)
24853       return false;
24854     assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
24855            "Expected unary shuffle");
24856   }
24857 
24858   unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
24859   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
24860 
24861   // Handle PSHUFLW/PSHUFHW repeated patterns.
24862   if (MaskScalarSizeInBits == 16) {
24863     SmallVector<int, 4> RepeatedMask;
24864     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
24865       ArrayRef<int> LoMask(Mask.data() + 0, 4);
24866       ArrayRef<int> HiMask(Mask.data() + 4, 4);
24867 
24868       // PSHUFLW: permute lower 4 elements only.
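      // As a worked example of the usual 2-bits-per-element immediate encoding,
      // a LoMask of {2, 1, 0, 3} becomes 2 | (1 << 2) | (0 << 4) | (3 << 6) = 0xC6.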
24869       if (isUndefOrInRange(LoMask, 0, 4) &&
24870           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
24871         Shuffle = X86ISD::PSHUFLW;
24872         ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24873         PermuteImm = getV4X86ShuffleImm(LoMask);
24874         return true;
24875       }
24876 
24877       // PSHUFHW: permute upper 4 elements only.
24878       if (isUndefOrInRange(HiMask, 4, 8) &&
24879           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
24880         // Offset the HiMask so that we can create the shuffle immediate.
24881         int OffsetHiMask[4];
24882         for (int i = 0; i != 4; ++i)
24883           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
24884 
24885         Shuffle = X86ISD::PSHUFHW;
24886         ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24887         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
24888         return true;
24889       }
24890 
24891       return false;
24892     }
24893     return false;
24894   }
24895 
24896   // We only support permutation of 32/64 bit elements after this.
24897   if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
24898     return false;
24899 
24900   // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
24901   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
24902   bool FloatDomain = SrcVT.isFloatingPoint();
24903   if (FloatDomain && !Subtarget.hasAVX())
24904     return false;
24905 
24906   // Pre-AVX2 we must use float shuffles on 256-bit vectors.
24907   if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
24908     FloatDomain = true;
24909 
24910   // Check for lane crossing permutes.
24911   if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
24912     // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
24913     if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
24914       Shuffle = X86ISD::VPERMI;
24915       ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
24916       PermuteImm = getV4X86ShuffleImm(Mask);
24917       return true;
24918     }
24919     if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
24920       SmallVector<int, 4> RepeatedMask;
24921       if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
24922         Shuffle = X86ISD::VPERMI;
24923         ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
24924         PermuteImm = getV4X86ShuffleImm(RepeatedMask);
24925         return true;
24926       }
24927     }
24928     return false;
24929   }
24930 
24931   // VPERMILPD can permute with a non-repeating shuffle.
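  // VPERMILPD's immediate uses one bit per f64 element, choosing the low (0) or
  // high (1) element within that element's 128-bit lane, hence the
  // (M & 1) << i accumulation below.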
24932   if (FloatDomain && MaskScalarSizeInBits == 64) {
24933     Shuffle = X86ISD::VPERMILPI;
24934     ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
24935     PermuteImm = 0;
24936     for (int i = 0, e = Mask.size(); i != e; ++i) {
24937       int M = Mask[i];
24938       if (M == SM_SentinelUndef)
24939         continue;
24940       assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
24941       PermuteImm |= (M & 1) << i;
24942     }
24943     return true;
24944   }
24945 
24946   // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
24947   SmallVector<int, 4> RepeatedMask;
24948   if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
24949     return false;
24950 
24951   // Narrow the repeated mask for 32-bit element permutes.
24952   SmallVector<int, 4> WordMask = RepeatedMask;
24953   if (MaskScalarSizeInBits == 64)
24954     scaleShuffleMask(2, RepeatedMask, WordMask);
24955 
24956   Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
24957   ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
24958   ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
24959   PermuteImm = getV4X86ShuffleImm(WordMask);
24960   return true;
24961 }
24962 
24963 // Attempt to match a combined unary shuffle mask against supported binary
24964 // shuffle instructions.
24965 // TODO: Investigate sharing more of this with shuffle lowering.
24966 static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24967                                      unsigned &Shuffle, MVT &ShuffleVT) {
24968   bool FloatDomain = SrcVT.isFloatingPoint();
24969 
24970   if (SrcVT.is128BitVector()) {
24971     if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
24972       Shuffle = X86ISD::MOVLHPS;
24973       ShuffleVT = MVT::v4f32;
24974       return true;
24975     }
24976     if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
24977       Shuffle = X86ISD::MOVHLPS;
24978       ShuffleVT = MVT::v4f32;
24979       return true;
24980     }
24981     if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
24982       Shuffle = X86ISD::UNPCKL;
24983       ShuffleVT = MVT::v4f32;
24984       return true;
24985     }
24986     if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
24987       Shuffle = X86ISD::UNPCKH;
24988       ShuffleVT = MVT::v4f32;
24989       return true;
24990     }
24991     if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
24992         isTargetShuffleEquivalent(
24993             Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
24994       Shuffle = X86ISD::UNPCKL;
24995       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
24996       return true;
24997     }
24998     if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
24999         isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
25000                                          13, 14, 14, 15, 15})) {
25001       Shuffle = X86ISD::UNPCKH;
25002       ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
25003       return true;
25004     }
25005   }
25006 
25007   return false;
25008 }
25009 
25010 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
25011 /// possible.
25012 ///
25013 /// This is the leaf of the recursive combine below. When we have found some
25014 /// chain of single-use x86 shuffle instructions and accumulated the combined
25015 /// shuffle mask represented by them, this will try to pattern match that mask
25016 /// into either a single instruction if there is a special purpose instruction
25017 /// for this operation, or into a PSHUFB instruction which is a fully general
25018 /// instruction but should only be used to replace chains over a certain depth.
25019 static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
25020                                    ArrayRef<int> BaseMask, int Depth,
25021                                    bool HasVariableMask, SelectionDAG &DAG,
25022                                    TargetLowering::DAGCombinerInfo &DCI,
25023                                    const X86Subtarget &Subtarget) {
25024   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
25025 
25026   // Find the operand that enters the chain. Note that multiple uses are OK
25027   // here; we're not going to remove the operand we find.
25028   Input = peekThroughBitcasts(Input);
25029 
25030   MVT VT = Input.getSimpleValueType();
25031   MVT RootVT = Root.getSimpleValueType();
25032   SDLoc DL(Root);
25033 
25034   SDValue Res;
25035 
25036   unsigned NumBaseMaskElts = BaseMask.size();
25037   if (NumBaseMaskElts == 1) {
25038     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
25039     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
25040                   /*AddTo*/ true);
25041     return true;
25042   }
25043 
25044   unsigned RootSizeInBits = RootVT.getSizeInBits();
25045   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
25046 
25047   // Don't combine if we are an AVX512/EVEX target and the mask element size
25048   // is different from the root element size - this would prevent writemasks
25049   // from being reused.
25050   // TODO - this currently prevents all lane shuffles from occurring.
25051   // TODO - check for writemasks usage instead of always preventing combining.
25052   // TODO - attempt to narrow Mask back to writemask size.
25053   if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
25054       (RootSizeInBits == 512 ||
25055        (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
25056     return false;
25057   }
25058 
25059   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
25060 
25061   // Handle 128-bit lane shuffles of 256-bit vectors.
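  // The VPERM2X128 immediate encodes one source lane per nibble: a base mask
  // of {1, 0}, for example, swaps the two 128-bit lanes (immediate 0x01), and
  // an undef/zero mask element sets bit 3 (or 7) to zero that result lane.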
25062   if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
25063       !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
25064     if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
25065       return false; // Nothing to do!
25066     MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
25067                                                                   : MVT::v4i64);
25068     unsigned PermMask = 0;
25069     PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
25070     PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
25071 
25072     Res = DAG.getBitcast(ShuffleVT, Input);
25073     DCI.AddToWorklist(Res.getNode());
25074     Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
25075                       DAG.getUNDEF(ShuffleVT),
25076                       DAG.getConstant(PermMask, DL, MVT::i8));
25077     DCI.AddToWorklist(Res.getNode());
25078     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25079                   /*AddTo*/ true);
25080     return true;
25081   }
25082 
25083   // For masks that have been widened to 128-bit elements or more,
25084   // narrow back down to 64-bit elements.
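  // For example, with a scale of 2 each 128-bit element index M expands to
  // the 64-bit element pair {2*M, 2*M+1}.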
25085   SmallVector<int, 64> Mask;
25086   if (BaseMaskEltSizeInBits > 64) {
25087     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
25088     int MaskScale = BaseMaskEltSizeInBits / 64;
25089     scaleShuffleMask(MaskScale, BaseMask, Mask);
25090   } else {
25091     Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
25092   }
25093 
25094   unsigned NumMaskElts = Mask.size();
25095   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
25096 
25097   // Determine the effective mask value type.
25098   bool FloatDomain =
25099       (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
25100       (32 <= MaskEltSizeInBits);
25101   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
25102                            : MVT::getIntegerVT(MaskEltSizeInBits);
25103   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
25104 
25105   // Attempt to match the mask against known shuffle patterns.
25106   MVT ShuffleVT;
25107   unsigned Shuffle, PermuteImm;
25108 
25109   if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
25110     if (Depth == 1 && Root.getOpcode() == Shuffle)
25111       return false; // Nothing to do!
25112     Res = DAG.getBitcast(ShuffleVT, Input);
25113     DCI.AddToWorklist(Res.getNode());
25114     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
25115     DCI.AddToWorklist(Res.getNode());
25116     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25117                   /*AddTo*/ true);
25118     return true;
25119   }
25120 
25121   if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
25122                                 PermuteImm)) {
25123     if (Depth == 1 && Root.getOpcode() == Shuffle)
25124       return false; // Nothing to do!
25125     Res = DAG.getBitcast(ShuffleVT, Input);
25126     DCI.AddToWorklist(Res.getNode());
25127     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
25128                       DAG.getConstant(PermuteImm, DL, MVT::i8));
25129     DCI.AddToWorklist(Res.getNode());
25130     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25131                   /*AddTo*/ true);
25132     return true;
25133   }
25134 
25135   if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
25136     if (Depth == 1 && Root.getOpcode() == Shuffle)
25137       return false; // Nothing to do!
25138     Res = DAG.getBitcast(ShuffleVT, Input);
25139     DCI.AddToWorklist(Res.getNode());
25140     Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
25141     DCI.AddToWorklist(Res.getNode());
25142     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25143                   /*AddTo*/ true);
25144     return true;
25145   }
25146 
25147   // Attempt to blend with zero.
25148   if (NumMaskElts <= 8 &&
25149       ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
25150        (Subtarget.hasAVX() && VT.is256BitVector()))) {
25151     // Convert VT to a type compatible with X86ISD::BLENDI.
25152     // TODO - add 16i16 support (requires lane duplication).
25153     MVT ShuffleVT = MaskVT;
25154     if (Subtarget.hasAVX2()) {
25155       if (ShuffleVT == MVT::v4i64)
25156         ShuffleVT = MVT::v8i32;
25157       else if (ShuffleVT == MVT::v2i64)
25158         ShuffleVT = MVT::v4i32;
25159     } else {
25160       if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
25161         ShuffleVT = MVT::v8i16;
25162       else if (ShuffleVT == MVT::v4i64)
25163         ShuffleVT = MVT::v4f64;
25164       else if (ShuffleVT == MVT::v8i32)
25165         ShuffleVT = MVT::v8f32;
25166     }
25167 
25168     if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
25169                                          /*Low*/ 0) &&
25170         NumMaskElts <= ShuffleVT.getVectorNumElements()) {
25171       unsigned BlendMask = 0;
25172       unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
25173       unsigned MaskRatio = ShuffleSize / NumMaskElts;
25174 
25175       if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
25176         return false;
25177 
25178       for (unsigned i = 0; i != ShuffleSize; ++i)
25179         if (Mask[i / MaskRatio] < 0)
25180           BlendMask |= 1u << i;
25181 
25182       SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
25183       Res = DAG.getBitcast(ShuffleVT, Input);
25184       DCI.AddToWorklist(Res.getNode());
25185       Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
25186                         DAG.getConstant(BlendMask, DL, MVT::i8));
25187       DCI.AddToWorklist(Res.getNode());
25188       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25189                     /*AddTo*/ true);
25190       return true;
25191     }
25192   }
25193 
25194   // Attempt to combine to INSERTPS.
25195   if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
25196       (VT == MVT::v2f64 || VT == MVT::v4f32)) {
25197     SmallBitVector Zeroable(4, false);
25198     for (unsigned i = 0; i != NumMaskElts; ++i)
25199       if (Mask[i] < 0)
25200         Zeroable[i] = true;
25201 
25202     unsigned InsertPSMask;
25203     SDValue V1 = Input, V2 = Input;
25204     if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
25205                                                        Zeroable, Mask, DAG)) {
25206       if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
25207         return false; // Nothing to do!
25208       V1 = DAG.getBitcast(MVT::v4f32, V1);
25209       DCI.AddToWorklist(V1.getNode());
25210       V2 = DAG.getBitcast(MVT::v4f32, V2);
25211       DCI.AddToWorklist(V2.getNode());
25212       Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
25213                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
25214       DCI.AddToWorklist(Res.getNode());
25215       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25216                     /*AddTo*/ true);
25217       return true;
25218     }
25219   }
25220 
25221   // Don't try to re-form single instruction chains under any circumstances now
25222   // that we've done encoding canonicalization for them.
25223   if (Depth < 2)
25224     return false;
25225 
25226   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
25227     return false;
25228 
25229   bool MaskContainsZeros =
25230       llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
25231 
25232   // If we have a single input shuffle with different shuffle patterns in the
25233   // 128-bit lanes, use the variable-mask form of VPERMILPS (X86ISD::VPERMILPV).
25234   // TODO: Combine other mask types at higher depths.
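  // For example, a v8f32 mask {1,0,3,2, 6,7,4,5} uses a different pattern in
  // each 128-bit lane, so no immediate-form shuffle fits; it becomes VPERMILPS
  // with the per-lane index vector {1,0,3,2, 2,3,0,1} (each index modulo 4).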
25235   if (HasVariableMask && !MaskContainsZeros &&
25236       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
25237        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
25238     SmallVector<SDValue, 16> VPermIdx;
25239     for (int M : Mask) {
25240       SDValue Idx =
25241           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
25242       VPermIdx.push_back(Idx);
25243     }
25244     MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
25245     SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
25246     DCI.AddToWorklist(VPermMask.getNode());
25247     Res = DAG.getBitcast(MaskVT, Input);
25248     DCI.AddToWorklist(Res.getNode());
25249     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
25250     DCI.AddToWorklist(Res.getNode());
25251     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25252                   /*AddTo*/ true);
25253     return true;
25254   }
25255 
25256   // If we have 3 or more shuffle instructions or a chain involving a variable
25257   // mask, we can replace them with a single PSHUFB instruction profitably.
25258   // Intel's manuals suggest only using PSHUFB if doing so replaces 5 or more
25259   // instructions, but in practice PSHUFB tends to be *very* fast so we're
25260   // more aggressive.
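  // For example, a v4i32 mask {0, SM_SentinelZero, 2, 3} (Ratio = 4) expands
  // to the byte control mask {0,1,2,3, 255,255,255,255, 8,9,10,11, 12,13,14,15};
  // the 255 bytes have the MSB set, so PSHUFB zeroes those destination bytes.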
25261   if ((Depth >= 3 || HasVariableMask) &&
25262       ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
25263        (VT.is256BitVector() && Subtarget.hasAVX2()) ||
25264        (VT.is512BitVector() && Subtarget.hasBWI()))) {
25265     SmallVector<SDValue, 16> PSHUFBMask;
25266     int NumBytes = VT.getSizeInBits() / 8;
25267     int Ratio = NumBytes / NumMaskElts;
25268     for (int i = 0; i < NumBytes; ++i) {
25269       int M = Mask[i / Ratio];
25270       if (M == SM_SentinelUndef) {
25271         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
25272         continue;
25273       }
25274       if (M == SM_SentinelZero) {
25275         PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
25276         continue;
25277       }
25278       M = Ratio * M + i % Ratio;
25279       assert ((M / 16) == (i / 16) && "Lane crossing detected");
25280       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
25281     }
25282     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
25283     Res = DAG.getBitcast(ByteVT, Input);
25284     DCI.AddToWorklist(Res.getNode());
25285     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
25286     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
25287     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
25288     DCI.AddToWorklist(Res.getNode());
25289     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25290                   /*AddTo*/ true);
25291     return true;
25292   }
25293 
25294   // Failed to find any combines.
25295   return false;
25296 }
25297 
25298 /// \brief Fully generic combining of x86 shuffle instructions.
25299 ///
25300 /// This should be the last combine run over the x86 shuffle instructions. Once
25301 /// they have been fully optimized, this will recursively consider all chains
25302 /// of single-use shuffle instructions, build a generic model of the cumulative
25303 /// shuffle operation, and check for simpler instructions which implement this
25304 /// operation. We use this primarily for two purposes:
25305 ///
25306 /// 1) Collapse generic shuffles to specialized single instructions when
25307 ///    equivalent. In most cases, this is just an encoding size win, but
25308 ///    sometimes we will collapse multiple generic shuffles into a single
25309 ///    special-purpose shuffle.
25310 /// 2) Look for sequences of shuffle instructions with 3 or more total
25311 ///    instructions, and replace them with the slightly more expensive SSSE3
25312 ///    PSHUFB instruction if available. We do this as the last combining step
25313 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
25314 ///    a suitable short sequence of other instructions. The PSHUFB will either
25315 ///    use a register or have to read from memory and so is slightly (but only
25316 ///    slightly) more expensive than the other shuffle instructions.
25317 ///
25318 /// Because this is inherently a quadratic operation (for each shuffle in
25319 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
25320 /// This should never be an issue in practice as the shuffle lowering doesn't
25321 /// produce sequences of more than 8 instructions.
25322 ///
25323 /// FIXME: We will currently miss some cases where the redundant shuffling
25324 /// would simplify under the threshold for PSHUFB formation because of
25325 /// combine-ordering. To fix this, we should do the redundant instruction
25326 /// combining in this recursive walk.
25327 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
25328                                           ArrayRef<int> RootMask,
25329                                           int Depth, bool HasVariableMask,
25330                                           SelectionDAG &DAG,
25331                                           TargetLowering::DAGCombinerInfo &DCI,
25332                                           const X86Subtarget &Subtarget) {
25333   // Bound the depth of our recursive combine because this is ultimately
25334   // quadratic in nature.
25335   if (Depth > 8)
25336     return false;
25337 
25338   // Directly rip through bitcasts to find the underlying operand.
25339   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
25340     Op = Op.getOperand(0);
25341 
25342   MVT VT = Op.getSimpleValueType();
25343   if (!VT.isVector())
25344     return false; // Bail if we hit a non-vector.
25345 
25346   assert(Root.getSimpleValueType().isVector() &&
25347          "Shuffles operate on vector types!");
25348   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
25349          "Can only combine shuffles of the same vector register size.");
25350 
25351   // Extract target shuffle mask and resolve sentinels and inputs.
25352   SDValue Input0, Input1;
25353   SmallVector<int, 16> OpMask;
25354   if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
25355     return false;
25356 
25357   assert(VT.getVectorNumElements() == OpMask.size() &&
25358          "Different mask size from vector size!");
25359   assert(((RootMask.size() > OpMask.size() &&
25360            RootMask.size() % OpMask.size() == 0) ||
25361           (OpMask.size() > RootMask.size() &&
25362            OpMask.size() % RootMask.size() == 0) ||
25363           OpMask.size() == RootMask.size()) &&
25364          "The smaller number of elements must divide the larger.");
25365   int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
25366   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
25367   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
25368   assert(((RootRatio == 1 && OpRatio == 1) ||
25369           (RootRatio == 1) != (OpRatio == 1)) &&
25370          "Must not have a ratio for both incoming and op masks!");
25371 
25372   SmallVector<int, 16> Mask;
25373   Mask.reserve(MaskWidth);
25374 
25375   // Merge this shuffle operation's mask into our accumulated mask. Note that
25376   // this shuffle's mask will be the first applied to the input, followed by the
25377   // root mask to get us all the way to the root value arrangement. The reason
25378   // for this order is that we are recursing up the operation chain.
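  // For example, with equal-width masks (both ratios are 1), if this op's mask
  // is {1,0,3,2} and the accumulated root mask is {2,3,0,1}, the merged mask is
  // OpMask[RootMask[i]] == {3,2,1,0}.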
25379   for (int i = 0; i < MaskWidth; ++i) {
25380     int RootIdx = i / RootRatio;
25381     if (RootMask[RootIdx] < 0) {
25382       // This is a zero or undef lane, we're done.
25383       Mask.push_back(RootMask[RootIdx]);
25384       continue;
25385     }
25386 
25387     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
25388     int OpIdx = RootMaskedIdx / OpRatio;
25389     if (OpMask[OpIdx] < 0) {
25390       // The incoming lanes are zero or undef, it doesn't matter which ones we
25391       // are using.
25392       Mask.push_back(OpMask[OpIdx]);
25393       continue;
25394     }
25395 
25396     // Ok, we have non-zero lanes, map them through.
25397     Mask.push_back(OpMask[OpIdx] * OpRatio +
25398                    RootMaskedIdx % OpRatio);
25399   }
25400 
25401   // Handle the all undef/zero cases early.
25402   if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
25403     DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
25404     return true;
25405   }
25406   if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
25407     // TODO - should we handle the mixed zero/undef case as well? Just returning
25408     // a zero mask will lose information on undef elements possibly reducing
25409     // future combine possibilities.
25410     DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
25411                                                 Subtarget, DAG, SDLoc(Root)));
25412     return true;
25413   }
25414 
25415   int MaskSize = Mask.size();
25416   bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
25417                   [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
25418   bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
25419                   [MaskSize](int Idx) { return MaskSize <= Idx; });
25420 
25421   // At the moment we can only combine unary shuffle mask cases.
25422   if (UseInput0 && UseInput1)
25423     return false;
25424   else if (UseInput1) {
25425     std::swap(Input0, Input1);
25426     ShuffleVectorSDNode::commuteMask(Mask);
25427   }
25428 
25429   assert(Input0 && "Shuffle with no inputs detected");
25430 
25431   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
25432 
25433   // See if we can recurse into Input0 (if it's a target shuffle).
25434   if (Op->isOnlyUserOf(Input0.getNode()) &&
25435       combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
25436                                     HasVariableMask, DAG, DCI, Subtarget))
25437     return true;
25438 
25439   // Minor canonicalization of the accumulated shuffle mask to make it easier
25440   // to match below. All this does is detect masks with sequential pairs of
25441   // elements, and shrink them to the half-width mask. It does this in a loop
25442   // so it will reduce the size of the mask to the minimal width mask which
25443   // performs an equivalent shuffle.
25444   SmallVector<int, 16> WidenedMask;
25445   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
25446     Mask = std::move(WidenedMask);
25447   }
25448 
25449   return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
25450                                 DCI, Subtarget);
25451 }
25452 
25453 /// \brief Get the PSHUF-style mask from PSHUF node.
25454 ///
25455 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
25456 /// PSHUF-style masks that can be reused with such instructions.
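///
/// For example, for a PSHUFHW node whose full v8i16 mask is
/// {0,1,2,3, 7,6,5,4}, this returns the 4-element mask {3,2,1,0}.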
25457 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
25458   MVT VT = N.getSimpleValueType();
25459   SmallVector<int, 4> Mask;
25460   SmallVector<SDValue, 2> Ops;
25461   bool IsUnary;
25462   bool HaveMask =
25463       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
25464   (void)HaveMask;
25465   assert(HaveMask);
25466 
25467   // If we have more than 128-bits, only the low 128-bits of shuffle mask
25468   // matter. Check that the upper masks are repeats and remove them.
25469   if (VT.getSizeInBits() > 128) {
25470     int LaneElts = 128 / VT.getScalarSizeInBits();
25471 #ifndef NDEBUG
25472     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
25473       for (int j = 0; j < LaneElts; ++j)
25474         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
25475                "Mask doesn't repeat in high 128-bit lanes!");
25476 #endif
25477     Mask.resize(LaneElts);
25478   }
25479 
25480   switch (N.getOpcode()) {
25481   case X86ISD::PSHUFD:
25482     return Mask;
25483   case X86ISD::PSHUFLW:
25484     Mask.resize(4);
25485     return Mask;
25486   case X86ISD::PSHUFHW:
25487     Mask.erase(Mask.begin(), Mask.begin() + 4);
25488     for (int &M : Mask)
25489       M -= 4;
25490     return Mask;
25491   default:
25492     llvm_unreachable("No valid shuffle instruction found!");
25493   }
25494 }
25495 
25496 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
25497 ///
25498 /// We walk up the chain and look for a combinable shuffle, skipping over
25499 /// shuffles that we could hoist this shuffle's transformation past without
25500 /// altering anything.
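///
/// For example, pshufd<2,3,0,1> of pshufd<1,0,3,2> merges into a single
/// pshufd<3,2,1,0> of the original operand; any pshuflw/pshufhw/unpck nodes
/// that were skipped on the way up are rebuilt on top of the new shuffle.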
25501 static SDValue
25502 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
25503                              SelectionDAG &DAG,
25504                              TargetLowering::DAGCombinerInfo &DCI) {
25505   assert(N.getOpcode() == X86ISD::PSHUFD &&
25506          "Called with something other than an x86 128-bit half shuffle!");
25507   SDLoc DL(N);
25508 
25509   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
25510   // of the shuffles in the chain so that we can form a fresh chain to replace
25511   // this one.
25512   SmallVector<SDValue, 8> Chain;
25513   SDValue V = N.getOperand(0);
25514   for (; V.hasOneUse(); V = V.getOperand(0)) {
25515     switch (V.getOpcode()) {
25516     default:
25517       return SDValue(); // Nothing combined!
25518 
25519     case ISD::BITCAST:
25520       // Skip bitcasts as we always know the type for the target specific
25521       // instructions.
25522       continue;
25523 
25524     case X86ISD::PSHUFD:
25525       // Found another dword shuffle.
25526       break;
25527 
25528     case X86ISD::PSHUFLW:
25529       // Check that the low words (being shuffled) are the identity in the
25530       // dword shuffle, and the high words are self-contained.
25531       if (Mask[0] != 0 || Mask[1] != 1 ||
25532           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
25533         return SDValue();
25534 
25535       Chain.push_back(V);
25536       continue;
25537 
25538     case X86ISD::PSHUFHW:
25539       // Check that the high words (being shuffled) are the identity in the
25540       // dword shuffle, and the low words are self-contained.
25541       if (Mask[2] != 2 || Mask[3] != 3 ||
25542           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
25543         return SDValue();
25544 
25545       Chain.push_back(V);
25546       continue;
25547 
25548     case X86ISD::UNPCKL:
25549     case X86ISD::UNPCKH:
25550       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
25551       // shuffle into a preceding word shuffle.
25552       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
25553           V.getSimpleValueType().getVectorElementType() != MVT::i16)
25554         return SDValue();
25555 
25556       // Search for a half-shuffle which we can combine with.
25557       unsigned CombineOp =
25558           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
25559       if (V.getOperand(0) != V.getOperand(1) ||
25560           !V->isOnlyUserOf(V.getOperand(0).getNode()))
25561         return SDValue();
25562       Chain.push_back(V);
25563       V = V.getOperand(0);
25564       do {
25565         switch (V.getOpcode()) {
25566         default:
25567           return SDValue(); // Nothing to combine.
25568 
25569         case X86ISD::PSHUFLW:
25570         case X86ISD::PSHUFHW:
25571           if (V.getOpcode() == CombineOp)
25572             break;
25573 
25574           Chain.push_back(V);
25575 
25576           // Fallthrough!
25577         case ISD::BITCAST:
25578           V = V.getOperand(0);
25579           continue;
25580         }
25581         break;
25582       } while (V.hasOneUse());
25583       break;
25584     }
25585     // Break out of the loop if we break out of the switch.
25586     break;
25587   }
25588 
25589   if (!V.hasOneUse())
25590     // We fell out of the loop without finding a viable combining instruction.
25591     return SDValue();
25592 
25593   // Merge this node's mask and our incoming mask.
25594   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25595   for (int &M : Mask)
25596     M = VMask[M];
25597   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
25598                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25599 
25600   // Rebuild the chain around this new shuffle.
25601   while (!Chain.empty()) {
25602     SDValue W = Chain.pop_back_val();
25603 
25604     if (V.getValueType() != W.getOperand(0).getValueType())
25605       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
25606 
25607     switch (W.getOpcode()) {
25608     default:
25609       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
25610 
25611     case X86ISD::UNPCKL:
25612     case X86ISD::UNPCKH:
25613       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
25614       break;
25615 
25616     case X86ISD::PSHUFD:
25617     case X86ISD::PSHUFLW:
25618     case X86ISD::PSHUFHW:
25619       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
25620       break;
25621     }
25622   }
25623   if (V.getValueType() != N.getValueType())
25624     V = DAG.getBitcast(N.getValueType(), V);
25625 
25626   // Return the new chain to replace N.
25627   return V;
25628 }
25629 
25630 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
25631 /// pshufhw.
25632 ///
25633 /// We walk up the chain, skipping shuffles of the other half and looking
25634 /// through shuffles which switch halves trying to find a shuffle of the same
25635 /// pair of dwords.
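///
/// For example, pshuflw<1,0,3,2> of pshufhw<m> of pshuflw<2,3,0,1> merges the
/// two low-half shuffles into a single pshuflw<3,2,1,0>; the intervening
/// pshufhw is skipped since it only permutes the high words.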
25636 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
25637                                         SelectionDAG &DAG,
25638                                         TargetLowering::DAGCombinerInfo &DCI) {
25639   assert(
25640       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
25641       "Called with something other than an x86 128-bit half shuffle!");
25642   SDLoc DL(N);
25643   unsigned CombineOpcode = N.getOpcode();
25644 
25645   // Walk up a single-use chain looking for a combinable shuffle.
25646   SDValue V = N.getOperand(0);
25647   for (; V.hasOneUse(); V = V.getOperand(0)) {
25648     switch (V.getOpcode()) {
25649     default:
25650       return false; // Nothing combined!
25651 
25652     case ISD::BITCAST:
25653       // Skip bitcasts as we always know the type for the target specific
25654       // instructions.
25655       continue;
25656 
25657     case X86ISD::PSHUFLW:
25658     case X86ISD::PSHUFHW:
25659       if (V.getOpcode() == CombineOpcode)
25660         break;
25661 
25662       // Other-half shuffles are no-ops.
25663       continue;
25664     }
25665     // Break out of the loop if we break out of the switch.
25666     break;
25667   }
25668 
25669   if (!V.hasOneUse())
25670     // We fell out of the loop without finding a viable combining instruction.
25671     return false;
25672 
25673   // Combine away the bottom node as its shuffle will be accumulated into
25674   // a preceding shuffle.
25675   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25676 
25677   // Record the old value.
25678   SDValue Old = V;
25679 
25680   // Merge this node's mask and our incoming mask (adjusted to account for all
25681   // the pshufd instructions encountered).
25682   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25683   for (int &M : Mask)
25684     M = VMask[M];
25685   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
25686                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25687 
25688   // Check that the shuffles didn't cancel each other out. If not, we need to
25689   // combine to the new one.
25690   if (Old != V)
25691     // Replace the combinable shuffle with the combined one, updating all users
25692     // so that we re-evaluate the chain here.
25693     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
25694 
25695   return true;
25696 }
25697 
25698 /// \brief Try to combine x86 target specific shuffles.
25699 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
25700                                     TargetLowering::DAGCombinerInfo &DCI,
25701                                     const X86Subtarget &Subtarget) {
25702   SDLoc DL(N);
25703   MVT VT = N.getSimpleValueType();
25704   SmallVector<int, 4> Mask;
25705 
25706   switch (N.getOpcode()) {
25707   case X86ISD::PSHUFD:
25708   case X86ISD::PSHUFLW:
25709   case X86ISD::PSHUFHW:
25710     Mask = getPSHUFShuffleMask(N);
25711     assert(Mask.size() == 4);
25712     break;
25713   case X86ISD::UNPCKL: {
25714     // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
25715     // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
25716     // moves upper half elements into the lower half part. For example:
25717     //
25718     // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
25719     //     undef:v16i8
25720     // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
25721     //
25722     // will be combined to:
25723     //
25724     // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
25725 
25726     // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
25727     // trigger because the lowering prefers more specialized instructions.
25728     if (!VT.is128BitVector())
25729       return SDValue();
25730 
25731     auto Op0 = N.getOperand(0);
25732     auto Op1 = N.getOperand(1);
25733     if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
25734       ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
25735 
25736       unsigned NumElts = VT.getVectorNumElements();
25737       SmallVector<int, 8> ExpectedMask(NumElts, -1);
25738       std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
25739                 NumElts / 2);
25740 
25741       auto ShufOp = Op1.getOperand(0);
25742       if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
25743         return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
25744     }
25745     return SDValue();
25746   }
25747   case X86ISD::BLENDI: {
25748     SDValue V0 = N->getOperand(0);
25749     SDValue V1 = N->getOperand(1);
25750     assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
25751            "Unexpected input vector types");
25752 
25753     // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
25754     // operands and changing the mask to 1. This saves us a bunch of
25755     // pattern-matching possibilities related to scalar math ops in SSE/AVX.
25756     // x86InstrInfo knows how to commute this back after instruction selection
25757     // if it would help register allocation.
25758 
25759     // TODO: If optimizing for size or a processor that doesn't suffer from
25760     // partial register update stalls, this should be transformed into a MOVSD
25761     // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
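    // For example, blendpd with an immediate of 2 (element 0 from the first
    // operand, element 1 from the second) becomes the same blend with the
    // operands swapped and an immediate of 1.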
25762 
25763     if (VT == MVT::v2f64)
25764       if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
25765         if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
25766           SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
25767           return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
25768         }
25769 
25770     // Attempt to merge blend(insertps(x,y),zero).
25771     if (V0.getOpcode() == X86ISD::INSERTPS ||
25772         V1.getOpcode() == X86ISD::INSERTPS) {
25773       assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25774 
25775       // Determine which elements are known to be zero.
25776       SmallVector<int, 8> TargetMask;
25777       SmallVector<SDValue, 2> BlendOps;
25778       if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
25779         return SDValue();
25780 
25781       // Helper function to take inner insertps node and attempt to
25782       // merge the blend with zero into its zero mask.
25783       auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
25784         if (V.getOpcode() != X86ISD::INSERTPS)
25785           return SDValue();
25786         SDValue Op0 = V.getOperand(0);
25787         SDValue Op1 = V.getOperand(1);
25788         SDValue Op2 = V.getOperand(2);
25789         unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25790 
25791         // Check each element of the blend node's target mask - must either
25792         // be zeroable (and update the zero mask) or selects the element from
25793         // the inner insertps node.
25794         for (int i = 0; i != 4; ++i)
25795           if (TargetMask[i] < 0)
25796             InsertPSMask |= (1u << i);
25797           else if (TargetMask[i] != (i + Offset))
25798             return SDValue();
25799         return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
25800                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
25801       };
25802 
25803       if (SDValue V = MergeInsertPSAndBlend(V0, 0))
25804         return V;
25805       if (SDValue V = MergeInsertPSAndBlend(V1, 4))
25806         return V;
25807     }
25808     return SDValue();
25809   }
25810   case X86ISD::INSERTPS: {
25811     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25812     SDValue Op0 = N.getOperand(0);
25813     SDValue Op1 = N.getOperand(1);
25814     SDValue Op2 = N.getOperand(2);
25815     unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25816     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
25817     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
25818     unsigned ZeroMask = InsertPSMask & 0xF;
25819 
25820     // If we zero out all elements from Op0 then we don't need to reference it.
25821     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
25822       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
25823                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25824 
25825     // If we zero out the element from Op1 then we don't need to reference it.
25826     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
25827       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25828                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25829 
25830     // Attempt to merge insertps Op1 with an inner target shuffle node.
25831     SmallVector<int, 8> TargetMask1;
25832     SmallVector<SDValue, 2> Ops1;
25833     if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
25834       int M = TargetMask1[SrcIdx];
25835       if (isUndefOrZero(M)) {
25836         // Zero/UNDEF insertion - zero out element and remove dependency.
25837         InsertPSMask |= (1u << DstIdx);
25838         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25839                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
25840       }
25841       // Update insertps mask srcidx and reference the source input directly.
25842       assert(0 <= M && M < 8 && "Shuffle index out of range");
25843       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
25844       Op1 = Ops1[M < 4 ? 0 : 1];
25845       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25846                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25847     }
25848 
25849     // Attempt to merge insertps Op0 with an inner target shuffle node.
25850     SmallVector<int, 8> TargetMask0;
25851     SmallVector<SDValue, 2> Ops0;
25852     if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
25853       return SDValue();
25854 
25855     bool Updated = false;
25856     bool UseInput00 = false;
25857     bool UseInput01 = false;
25858     for (int i = 0; i != 4; ++i) {
25859       int M = TargetMask0[i];
25860       if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
25861         // No change if element is already zero or the inserted element.
25862         continue;
25863       } else if (isUndefOrZero(M)) {
25864         // If the target mask is undef/zero then we must zero the element.
25865         InsertPSMask |= (1u << i);
25866         Updated = true;
25867         continue;
25868       }
25869 
25870       // The input vector element must be inline.
25871       if (M != i && M != (i + 4))
25872         return SDValue();
25873 
25874       // Determine which inputs of the target shuffle we're using.
25875       UseInput00 |= (0 <= M && M < 4);
25876       UseInput01 |= (4 <= M);
25877     }
25878 
25879     // If we're not using both inputs of the target shuffle then use the
25880     // referenced input directly.
25881     if (UseInput00 && !UseInput01) {
25882       Updated = true;
25883       Op0 = Ops0[0];
25884     } else if (!UseInput00 && UseInput01) {
25885       Updated = true;
25886       Op0 = Ops0[1];
25887     }
25888 
25889     if (Updated)
25890       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25891                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
25892 
25893     return SDValue();
25894   }
25895   default:
25896     return SDValue();
25897   }
25898 
25899   // Nuke no-op shuffles that show up after combining.
25900   if (isNoopShuffleMask(Mask))
25901     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25902 
25903   // Look for simplifications involving one or two shuffle instructions.
25904   SDValue V = N.getOperand(0);
25905   switch (N.getOpcode()) {
25906   default:
25907     break;
25908   case X86ISD::PSHUFLW:
25909   case X86ISD::PSHUFHW:
25910     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
25911 
25912     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
25913       return SDValue(); // We combined away this shuffle, so we're done.
25914 
25915     // See if this reduces to a PSHUFD which is no more expensive and can
25916     // combine with more operations. Note that it has to at least flip the
25917     // dwords as otherwise it would have been removed as a no-op.
25918     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
25919       int DMask[] = {0, 1, 2, 3};
25920       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
25921       DMask[DOffset + 0] = DOffset + 1;
25922       DMask[DOffset + 1] = DOffset + 0;
25923       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
25924       V = DAG.getBitcast(DVT, V);
25925       DCI.AddToWorklist(V.getNode());
25926       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
25927                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
25928       DCI.AddToWorklist(V.getNode());
25929       return DAG.getBitcast(VT, V);
25930     }
25931 
25932     // Look for shuffle patterns which can be implemented as a single unpack.
25933     // FIXME: This doesn't handle the location of the PSHUFD generically, and
25934     // only works when we have a PSHUFD followed by two half-shuffles.
25935     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
25936         (V.getOpcode() == X86ISD::PSHUFLW ||
25937          V.getOpcode() == X86ISD::PSHUFHW) &&
25938         V.getOpcode() != N.getOpcode() &&
25939         V.hasOneUse()) {
25940       SDValue D = V.getOperand(0);
25941       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
25942         D = D.getOperand(0);
25943       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
25944         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25945         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
25946         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25947         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25948         int WordMask[8];
25949         for (int i = 0; i < 4; ++i) {
25950           WordMask[i + NOffset] = Mask[i] + NOffset;
25951           WordMask[i + VOffset] = VMask[i] + VOffset;
25952         }
25953         // Map the word mask through the DWord mask.
25954         int MappedMask[8];
25955         for (int i = 0; i < 8; ++i)
25956           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
25957         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
25958             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
25959           // We can replace all three shuffles with an unpack.
25960           V = DAG.getBitcast(VT, D.getOperand(0));
25961           DCI.AddToWorklist(V.getNode());
25962           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
25963                                                 : X86ISD::UNPCKH,
25964                              DL, VT, V, V);
25965         }
25966       }
25967     }
25968 
25969     break;
25970 
25971   case X86ISD::PSHUFD:
25972     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
25973       return NewN;
25974 
25975     break;
25976   }
25977 
25978   return SDValue();
25979 }
25980 
25981 /// \brief Try to combine a shuffle into a target-specific add-sub node.
25982 ///
25983 /// We combine this directly on the abstract vector shuffle nodes so it is
25984 /// easier to generically match. We also insert dummy vector shuffle nodes for
25985 /// the operands which explicitly discard the lanes which are unused by this
25986 /// operation to try to flow through the rest of the combiner the fact that
25987 /// they're unused.
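///
/// For example, a v4f32 shuffle<0,5,2,7> of (fsub A, B) and (fadd A, B) takes
/// the subtraction in the even lanes and the addition in the odd lanes, which
/// is exactly X86ISD::ADDSUB A, B.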
25988 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
25989                                       SelectionDAG &DAG) {
25990   SDLoc DL(N);
25991   EVT VT = N->getValueType(0);
25992   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
25993       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
25994     return SDValue();
25995 
25996   // We only handle target-independent shuffles.
25997   // FIXME: It would be easy and harmless to use the target shuffle mask
25998   // extraction tool to support more.
25999   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
26000     return SDValue();
26001 
26002   auto *SVN = cast<ShuffleVectorSDNode>(N);
26003   SmallVector<int, 8> Mask;
26004   for (int M : SVN->getMask())
26005     Mask.push_back(M);
26006 
26007   SDValue V1 = N->getOperand(0);
26008   SDValue V2 = N->getOperand(1);
26009 
26010   // We require the first shuffle operand to be the FSUB node, and the second to
26011   // be the FADD node.
26012   if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
26013     ShuffleVectorSDNode::commuteMask(Mask);
26014     std::swap(V1, V2);
26015   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
26016     return SDValue();
26017 
26018   // If there are other uses of these operations we can't fold them.
26019   if (!V1->hasOneUse() || !V2->hasOneUse())
26020     return SDValue();
26021 
26022   // Ensure that both operations have the same operands. Note that we can
26023   // commute the FADD operands.
26024   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
26025   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
26026       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
26027     return SDValue();
26028 
26029   // We're looking for blends between FADD and FSUB nodes. We insist on these
26030   // nodes being lined up in a specific expected pattern.
26031   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
26032         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
26033         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
26034     return SDValue();
26035 
26036   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
26037 }
26038 
26039 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
26040                               TargetLowering::DAGCombinerInfo &DCI,
26041                               const X86Subtarget &Subtarget) {
26042   SDLoc dl(N);
26043   EVT VT = N->getValueType(0);
26044 
26045   // Don't create instructions with illegal types after legalize types has run.
26046   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26047   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
26048     return SDValue();
26049 
26050   // If we have legalized the vector types, look for blends of FADD and FSUB
26051   // nodes that we can fuse into an ADDSUB node.
26052   if (TLI.isTypeLegal(VT))
26053     if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
26054       return AddSub;
26055 
26056   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
26057   if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
26058       N->getOpcode() == ISD::VECTOR_SHUFFLE)
26059     return combineShuffle256(N, DAG, DCI, Subtarget);
26060 
26061   // During Type Legalization, when promoting illegal vector types,
26062   // the backend might introduce new shuffle dag nodes and bitcasts.
26063   //
26064   // This code performs the following transformation:
26065   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
26066   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
26067   //
26068   // We do this only if both the bitcast and the BINOP dag nodes have
26069   // one use. Also, perform this transformation only if the new binary
26070   // operation is legal. This is to avoid introducing dag nodes that
26071   // potentially need to be further expanded (or custom lowered) into a
26072   // less optimal sequence of dag nodes.
26073   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
26074       N->getOpcode() == ISD::VECTOR_SHUFFLE &&
26075       N->getOperand(0).getOpcode() == ISD::BITCAST &&
26076       N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
26077     SDValue N0 = N->getOperand(0);
26078     SDValue N1 = N->getOperand(1);
26079 
26080     SDValue BC0 = N0.getOperand(0);
26081     EVT SVT = BC0.getValueType();
26082     unsigned Opcode = BC0.getOpcode();
26083     unsigned NumElts = VT.getVectorNumElements();
26084 
26085     if (BC0.hasOneUse() && SVT.isVector() &&
26086         SVT.getVectorNumElements() * 2 == NumElts &&
26087         TLI.isOperationLegal(Opcode, VT)) {
26088       bool CanFold = false;
26089       switch (Opcode) {
26090       default : break;
26091       case ISD::ADD :
26092       case ISD::FADD :
26093       case ISD::SUB :
26094       case ISD::FSUB :
26095       case ISD::MUL :
26096       case ISD::FMUL :
26097         CanFold = true;
26098       }
26099 
26100       unsigned SVTNumElts = SVT.getVectorNumElements();
26101       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
26102       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
26103         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
26104       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
26105         CanFold = SVOp->getMaskElt(i) < 0;
26106 
26107       if (CanFold) {
26108         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
26109         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
26110         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
26111         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
26112       }
26113     }
26114   }
26115 
26116   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
26117   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
26118   // consecutive, non-overlapping, and in the right order.
26119   SmallVector<SDValue, 16> Elts;
26120   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
26121     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
26122 
26123   if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
26124     return LD;
26125 
26126   if (isTargetShuffle(N->getOpcode())) {
26127     if (SDValue Shuffle =
26128             combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
26129       return Shuffle;
26130 
26131     // Try recursively combining arbitrary sequences of x86 shuffle
26132     // instructions into higher-order shuffles. We do this after combining
26133     // specific PSHUF instruction sequences into their minimal form so that we
26134     // can evaluate how many specialized shuffle instructions are involved in
26135     // a particular chain.
26136     SmallVector<int, 1> NonceMask; // Just a placeholder.
26137     NonceMask.push_back(0);
26138     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
26139                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
26140                                       DCI, Subtarget))
26141       return SDValue(); // This routine will use CombineTo to replace N.
26142   }
26143 
26144   return SDValue();
26145 }
26146 
26147 /// Check if a vector extract from a target-specific shuffle of a load can be
26148 /// folded into a single element load.
26149 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
26150 /// shuffles have been custom lowered so we need to handle those here.
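///
/// For example, (extract_vector_elt (X86ISD::PSHUFD (load p), <2,3,0,1>), 0)
/// selects element 2 of the load; rebuilding it as a generic VECTOR_SHUFFLE
/// plus extract lets the DAG combiner narrow it to a single scalar load.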
26151 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
26152                                          TargetLowering::DAGCombinerInfo &DCI) {
26153   if (DCI.isBeforeLegalizeOps())
26154     return SDValue();
26155 
26156   SDValue InVec = N->getOperand(0);
26157   SDValue EltNo = N->getOperand(1);
26158   EVT EltVT = N->getValueType(0);
26159 
26160   if (!isa<ConstantSDNode>(EltNo))
26161     return SDValue();
26162 
26163   EVT OriginalVT = InVec.getValueType();
26164 
26165   if (InVec.getOpcode() == ISD::BITCAST) {
26166     // Don't duplicate a load with other uses.
26167     if (!InVec.hasOneUse())
26168       return SDValue();
26169     EVT BCVT = InVec.getOperand(0).getValueType();
26170     if (!BCVT.isVector() ||
26171         BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
26172       return SDValue();
26173     InVec = InVec.getOperand(0);
26174   }
26175 
26176   EVT CurrentVT = InVec.getValueType();
26177 
26178   if (!isTargetShuffle(InVec.getOpcode()))
26179     return SDValue();
26180 
26181   // Don't duplicate a load with other uses.
26182   if (!InVec.hasOneUse())
26183     return SDValue();
26184 
26185   SmallVector<int, 16> ShuffleMask;
26186   SmallVector<SDValue, 2> ShuffleOps;
26187   bool UnaryShuffle;
26188   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
26189                             ShuffleOps, ShuffleMask, UnaryShuffle))
26190     return SDValue();
26191 
26192   // Select the input vector, guarding against an out-of-range extract index.
26193   unsigned NumElems = CurrentVT.getVectorNumElements();
26194   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
26195   int Idx = (Elt >= (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
26196 
26197   if (Idx == SM_SentinelZero)
26198     return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
26199                              : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
26200   if (Idx == SM_SentinelUndef)
26201     return DAG.getUNDEF(EltVT);
26202 
26203   assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
26204   SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
26205                                          : ShuffleOps[1];
26206 
26207   // If inputs to shuffle are the same for both ops, then allow 2 uses
26208   unsigned AllowedUses =
26209       (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
26210 
26211   if (LdNode.getOpcode() == ISD::BITCAST) {
26212     // Don't duplicate a load with other uses.
26213     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
26214       return SDValue();
26215 
26216     AllowedUses = 1; // only allow 1 load use if we have a bitcast
26217     LdNode = LdNode.getOperand(0);
26218   }
26219 
26220   if (!ISD::isNormalLoad(LdNode.getNode()))
26221     return SDValue();
26222 
26223   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
26224 
26225   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
26226     return SDValue();
26227 
26228   // If there's a bitcast before the shuffle, check if the load type and
26229   // alignment are valid.
26230   unsigned Align = LN0->getAlignment();
26231   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26232   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
26233       EltVT.getTypeForEVT(*DAG.getContext()));
26234 
26235   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
26236     return SDValue();
26237 
26238   // All checks match so transform back to vector_shuffle so that DAG combiner
26239   // can finish the job
26240   SDLoc dl(N);
26241 
26242   // Create shuffle node taking into account the case that it's a unary shuffle
26243   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
26244   Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
26245                                  ShuffleMask);
26246   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
26247   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
26248                      EltNo);
26249 }
26250 
26251 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
26252                               const X86Subtarget &Subtarget) {
26253   SDValue N0 = N->getOperand(0);
26254   EVT VT = N->getValueType(0);
26255 
26256   // Detect bitcasts of an i32 into the x86mmx low word. Since MMX types are
26257   // special and don't usually play with other vector types, it's better to
26258   // handle them early to be sure we emit efficient code by avoiding
26259   // store-load conversions.
26260   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
26261       N0.getValueType() == MVT::v2i32 &&
26262       isNullConstant(N0.getOperand(1))) {
26263     SDValue N00 = N0->getOperand(0);
26264     if (N00.getValueType() == MVT::i32)
26265       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
26266   }
26267 
26268   // Convert a bitcasted integer logic operation that has one bitcasted
26269   // floating-point operand and one constant operand into a floating-point
26270   // logic operation. This may create a load of the constant, but that is
26271   // cheaper than materializing the constant in an integer register and
26272   // transferring it to an SSE register or transferring the SSE operand to
26273   // integer register and back.
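  // For example, (f32 (bitcast (and (bitcast f32 X to i32), 0x7fffffff)))
  // becomes (X86ISD::FAND X, (bitcast 0x7fffffff)), keeping the usual fabs
  // pattern in the floating-point domain.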
26274   unsigned FPOpcode;
26275   switch (N0.getOpcode()) {
26276     case ISD::AND: FPOpcode = X86ISD::FAND; break;
26277     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
26278     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
26279     default: return SDValue();
26280   }
26281   if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
26282        (Subtarget.hasSSE2() && VT == MVT::f64)) &&
26283       isa<ConstantSDNode>(N0.getOperand(1)) &&
26284       N0.getOperand(0).getOpcode() == ISD::BITCAST &&
26285       N0.getOperand(0).getOperand(0).getValueType() == VT) {
26286     SDValue N000 = N0.getOperand(0).getOperand(0);
26287     SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
26288     return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
26289   }
26290 
26291   return SDValue();
26292 }
26293 
26294 /// Detect vector gather/scatter index generation and convert it from being a
26295 /// bunch of shuffles and extracts into a somewhat faster sequence.
26296 /// For i686, the best sequence is apparently storing the value and loading
26297 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
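///
/// A rough sketch of the x64 path (assumed IR, for exposition only): when all
/// four lanes of a v4i32 %v are extracted and then sign/zero-extended, we
/// bitcast %v to v2i64, extract the two i64 halves, and split each half with
/// a truncate plus a 32-bit arithmetic shift, rather than emitting four
/// independent lane extracts; on i686 we instead spill %v to a stack slot and
/// reload the four scalars.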
26298 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
26299                                        TargetLowering::DAGCombinerInfo &DCI) {
26300   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
26301     return NewOp;
26302 
26303   SDValue InputVector = N->getOperand(0);
26304   SDLoc dl(InputVector);
26305   // Detect mmx to i32 conversion through a v2i32 elt extract.
26306   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
26307       N->getValueType(0) == MVT::i32 &&
26308       InputVector.getValueType() == MVT::v2i32 &&
26309       isa<ConstantSDNode>(N->getOperand(1)) &&
26310       N->getConstantOperandVal(1) == 0) {
26311     SDValue MMXSrc = InputVector.getNode()->getOperand(0);
26312 
26313     // The bitcast source is a direct mmx result.
26314     if (MMXSrc.getValueType() == MVT::x86mmx)
26315       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
26316   }
26317 
26318   EVT VT = N->getValueType(0);
26319 
26320   if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
26321       InputVector.getOpcode() == ISD::BITCAST &&
26322       isa<ConstantSDNode>(InputVector.getOperand(0))) {
26323     uint64_t ExtractedElt =
26324         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
26325     uint64_t InputValue =
26326         cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
26327     uint64_t Res = (InputValue >> ExtractedElt) & 1;
26328     return DAG.getConstant(Res, dl, MVT::i1);
26329   }
26330   // Only operate on vectors of 4 elements, where the alternative shuffling
26331   // gets to be more expensive.
26332   if (InputVector.getValueType() != MVT::v4i32)
26333     return SDValue();
26334 
26335   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
26336   // single use which is a sign-extend or zero-extend, and all elements are
26337   // used.
26338   SmallVector<SDNode *, 4> Uses;
26339   unsigned ExtractedElements = 0;
26340   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
26341        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
26342     if (UI.getUse().getResNo() != InputVector.getResNo())
26343       return SDValue();
26344 
26345     SDNode *Extract = *UI;
26346     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26347       return SDValue();
26348 
26349     if (Extract->getValueType(0) != MVT::i32)
26350       return SDValue();
26351     if (!Extract->hasOneUse())
26352       return SDValue();
26353     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
26354         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
26355       return SDValue();
26356     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
26357       return SDValue();
26358 
26359     // Record which element was extracted.
26360     ExtractedElements |=
26361       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
26362 
26363     Uses.push_back(Extract);
26364   }
26365 
26366   // If not all the elements were used, this may not be worthwhile.
26367   if (ExtractedElements != 15)
26368     return SDValue();
26369 
26370   // Ok, we've now decided to do the transformation.
26371   // If 64-bit shifts are legal, use the extract-shift sequence,
26372   // otherwise bounce the vector off the cache.
26373   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26374   SDValue Vals[4];
26375 
26376   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
26377     SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
26378     auto &DL = DAG.getDataLayout();
26379     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
26380     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26381       DAG.getConstant(0, dl, VecIdxTy));
26382     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26383       DAG.getConstant(1, dl, VecIdxTy));
26384 
26385     SDValue ShAmt = DAG.getConstant(
26386         32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
26387     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
26388     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26389       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
26390     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
26391     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26392       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
26393   } else {
26394     // Store the value to a temporary stack slot.
26395     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
26396     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
26397       MachinePointerInfo(), false, false, 0);
26398 
26399     EVT ElementType = InputVector.getValueType().getVectorElementType();
26400     unsigned EltSize = ElementType.getSizeInBits() / 8;
26401 
26402     // Replace each use (extract) with a load of the appropriate element.
26403     for (unsigned i = 0; i < 4; ++i) {
26404       uint64_t Offset = EltSize * i;
26405       auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26406       SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
26407 
26408       SDValue ScalarAddr =
26409           DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
26410 
26411       // Load the scalar.
26412       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
26413                             ScalarAddr, MachinePointerInfo(),
26414                             false, false, false, 0);
26415 
26416     }
26417   }
26418 
26419   // Replace the extracts
26420   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
26421     UE = Uses.end(); UI != UE; ++UI) {
26422     SDNode *Extract = *UI;
26423 
26424     SDValue Idx = Extract->getOperand(1);
26425     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
26426     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
26427   }
26428 
26429   // The replacement was made in place; don't return anything.
26430   return SDValue();
26431 }
26432 
26433 /// Do target-specific dag combines on SELECT and VSELECT nodes.
26434 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
26435                              TargetLowering::DAGCombinerInfo &DCI,
26436                              const X86Subtarget &Subtarget) {
26437   SDLoc DL(N);
26438   SDValue Cond = N->getOperand(0);
26439   // Get the LHS/RHS of the select.
26440   SDValue LHS = N->getOperand(1);
26441   SDValue RHS = N->getOperand(2);
26442   EVT VT = LHS.getValueType();
26443   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26444 
26445   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
26446   // instructions match the semantics of the common C idiom x<y?x:y but not
26447   // x<=y?x:y, because of how they handle negative zero (which can be
26448   // ignored in unsafe-math mode).
26449   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
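  //
  // For example (an assumed source-level sketch, not from a test case):
  //   float fmin(float x, float y) { return x < y ? x : y; }
  // maps to a single MINSS via X86ISD::FMIN below, whereas
  //   x <= y ? x : y
  // only does so under unsafe-math or when an operand is known never to be
  // zero, because MINSS returns its second operand when both inputs are zero,
  // which disagrees with the <= form when x is -0.0 and y is +0.0.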
26450   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
26451       VT != MVT::f80 && VT != MVT::f128 &&
26452       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
26453       (Subtarget.hasSSE2() ||
26454        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
26455     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26456 
26457     unsigned Opcode = 0;
26458     // Check for x CC y ? x : y.
26459     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26460         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26461       switch (CC) {
26462       default: break;
26463       case ISD::SETULT:
26464         // Converting this to a min would handle NaNs incorrectly, and swapping
26465         // the operands would cause it to handle comparisons between positive
26466         // and negative zero incorrectly.
26467         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26468           if (!DAG.getTarget().Options.UnsafeFPMath &&
26469               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26470             break;
26471           std::swap(LHS, RHS);
26472         }
26473         Opcode = X86ISD::FMIN;
26474         break;
26475       case ISD::SETOLE:
26476         // Converting this to a min would handle comparisons between positive
26477         // and negative zero incorrectly.
26478         if (!DAG.getTarget().Options.UnsafeFPMath &&
26479             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26480           break;
26481         Opcode = X86ISD::FMIN;
26482         break;
26483       case ISD::SETULE:
26484         // Converting this to a min would handle both negative zeros and NaNs
26485         // incorrectly, but we can swap the operands to fix both.
26486         std::swap(LHS, RHS);
26487       case ISD::SETOLT:
26488       case ISD::SETLT:
26489       case ISD::SETLE:
26490         Opcode = X86ISD::FMIN;
26491         break;
26492 
26493       case ISD::SETOGE:
26494         // Converting this to a max would handle comparisons between positive
26495         // and negative zero incorrectly.
26496         if (!DAG.getTarget().Options.UnsafeFPMath &&
26497             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26498           break;
26499         Opcode = X86ISD::FMAX;
26500         break;
26501       case ISD::SETUGT:
26502         // Converting this to a max would handle NaNs incorrectly, and swapping
26503         // the operands would cause it to handle comparisons between positive
26504         // and negative zero incorrectly.
26505         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26506           if (!DAG.getTarget().Options.UnsafeFPMath &&
26507               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26508             break;
26509           std::swap(LHS, RHS);
26510         }
26511         Opcode = X86ISD::FMAX;
26512         break;
26513       case ISD::SETUGE:
26514         // Converting this to a max would handle both negative zeros and NaNs
26515         // incorrectly, but we can swap the operands to fix both.
26516         std::swap(LHS, RHS);
26517       case ISD::SETOGT:
26518       case ISD::SETGT:
26519       case ISD::SETGE:
26520         Opcode = X86ISD::FMAX;
26521         break;
26522       }
26523     // Check for x CC y ? y : x -- a min/max with reversed arms.
26524     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
26525                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
26526       switch (CC) {
26527       default: break;
26528       case ISD::SETOGE:
26529         // Converting this to a min would handle comparisons between positive
26530         // and negative zero incorrectly, and swapping the operands would
26531         // cause it to handle NaNs incorrectly.
26532         if (!DAG.getTarget().Options.UnsafeFPMath &&
26533             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
26534           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26535             break;
26536           std::swap(LHS, RHS);
26537         }
26538         Opcode = X86ISD::FMIN;
26539         break;
26540       case ISD::SETUGT:
26541         // Converting this to a min would handle NaNs incorrectly.
26542         if (!DAG.getTarget().Options.UnsafeFPMath &&
26543             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
26544           break;
26545         Opcode = X86ISD::FMIN;
26546         break;
26547       case ISD::SETUGE:
26548         // Converting this to a min would handle both negative zeros and NaNs
26549         // incorrectly, but we can swap the operands to fix both.
26550         std::swap(LHS, RHS);
26551       case ISD::SETOGT:
26552       case ISD::SETGT:
26553       case ISD::SETGE:
26554         Opcode = X86ISD::FMIN;
26555         break;
26556 
26557       case ISD::SETULT:
26558         // Converting this to a max would handle NaNs incorrectly.
26559         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26560           break;
26561         Opcode = X86ISD::FMAX;
26562         break;
26563       case ISD::SETOLE:
26564         // Converting this to a max would handle comparisons between positive
26565         // and negative zero incorrectly, and swapping the operands would
26566         // cause it to handle NaNs incorrectly.
26567         if (!DAG.getTarget().Options.UnsafeFPMath &&
26568             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
26569           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26570             break;
26571           std::swap(LHS, RHS);
26572         }
26573         Opcode = X86ISD::FMAX;
26574         break;
26575       case ISD::SETULE:
26576         // Converting this to a max would handle both negative zeros and NaNs
26577         // incorrectly, but we can swap the operands to fix both.
26578         std::swap(LHS, RHS);
26579       case ISD::SETOLT:
26580       case ISD::SETLT:
26581       case ISD::SETLE:
26582         Opcode = X86ISD::FMAX;
26583         break;
26584       }
26585     }
26586 
26587     if (Opcode)
26588       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
26589   }
26590 
26591   EVT CondVT = Cond.getValueType();
26592   if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
26593       CondVT.getVectorElementType() == MVT::i1) {
26594     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
26595     // lowering on KNL. In this case we convert it to
26596     // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
26597     // The same applies to all 128-bit and 256-bit vectors of i8 and i16.
26598     // Since SKX, these selects have a proper lowering.
26599     EVT OpVT = LHS.getValueType();
26600     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
26601         (OpVT.getVectorElementType() == MVT::i8 ||
26602          OpVT.getVectorElementType() == MVT::i16) &&
26603         !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
26604       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
26605       DCI.AddToWorklist(Cond.getNode());
26606       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
26607     }
26608   }
26609   // If this is a select between two integer constants, try to do some
26610   // optimizations.
26611   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
26612     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
26613       // Don't do this for crazy integer types.
26614       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
26615         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
26616         // so that TrueC (the true value) is larger than FalseC.
26617         bool NeedsCondInvert = false;
26618 
26619         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
26620             // Efficiently invertible.
26621             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
26622              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
26623               isa<ConstantSDNode>(Cond.getOperand(1))))) {
26624           NeedsCondInvert = true;
26625           std::swap(TrueC, FalseC);
26626         }
26627 
26628         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
26629         if (FalseC->getAPIntValue() == 0 &&
26630             TrueC->getAPIntValue().isPowerOf2()) {
26631           if (NeedsCondInvert) // Invert the condition if needed.
26632             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26633                                DAG.getConstant(1, DL, Cond.getValueType()));
26634 
26635           // Zero extend the condition if needed.
26636           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
26637 
26638           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
26639           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
26640                              DAG.getConstant(ShAmt, DL, MVT::i8));
26641         }
26642 
26643         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
26644         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
26645           if (NeedsCondInvert) // Invert the condition if needed.
26646             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26647                                DAG.getConstant(1, DL, Cond.getValueType()));
26648 
26649           // Zero extend the condition if needed.
26650           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
26651                              FalseC->getValueType(0), Cond);
26652           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26653                              SDValue(FalseC, 0));
26654         }
26655 
26656         // Optimize cases that will turn into an LEA instruction.  This requires
26657         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
26658         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
26659           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
26660           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
26661 
26662           bool isFastMultiplier = false;
26663           if (Diff < 10) {
26664             switch ((unsigned char)Diff) {
26665               default: break;
26666               case 1:  // result = add base, cond
26667               case 2:  // result = lea base(    , cond*2)
26668               case 3:  // result = lea base(cond, cond*2)
26669               case 4:  // result = lea base(    , cond*4)
26670               case 5:  // result = lea base(cond, cond*4)
26671               case 8:  // result = lea base(    , cond*8)
26672               case 9:  // result = lea base(cond, cond*8)
26673                 isFastMultiplier = true;
26674                 break;
26675             }
26676           }
26677 
26678           if (isFastMultiplier) {
26679             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
26680             if (NeedsCondInvert) // Invert the condition if needed.
26681               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26682                                  DAG.getConstant(1, DL, Cond.getValueType()));
26683 
26684             // Zero extend the condition if needed.
26685             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
26686                                Cond);
26687             // Scale the condition by the difference.
26688             if (Diff != 1)
26689               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
26690                                  DAG.getConstant(Diff, DL,
26691                                                  Cond.getValueType()));
26692 
26693             // Add the base if non-zero.
26694             if (FalseC->getAPIntValue() != 0)
26695               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26696                                  SDValue(FalseC, 0));
26697             return Cond;
26698           }
26699         }
26700       }
26701   }
26702 
26703   // Canonicalize max and min:
26704   // (x > y) ? x : y -> (x >= y) ? x : y
26705   // (x < y) ? x : y -> (x <= y) ? x : y
26706   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
26707   // the need for an extra compare
26708   // against zero. e.g.
26709   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
26710   // subl   %esi, %edi
26711   // testl  %edi, %edi
26712   // movl   $0, %eax
26713   // cmovgl %edi, %eax
26714   // =>
26715   // xorl   %eax, %eax
26716   // subl   %esi, %edi
26717   // cmovsl %eax, %edi
26718   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
26719       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26720       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26721     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26722     switch (CC) {
26723     default: break;
26724     case ISD::SETLT:
26725     case ISD::SETGT: {
26726       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
26727       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
26728                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
26729       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
26730     }
26731     }
26732   }
26733 
26734   // Early exit check
26735   if (!TLI.isTypeLegal(VT))
26736     return SDValue();
26737 
26738   // Match VSELECTs into subs with unsigned saturation.
26739   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
26740       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
26741       ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
26742        (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
26743     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26744 
26745     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
26746     // left side, invert the predicate to simplify the logic below.
26747     SDValue Other;
26748     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
26749       Other = RHS;
26750       CC = ISD::getSetCCInverse(CC, true);
26751     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
26752       Other = LHS;
26753     }
26754 
26755     if (Other.getNode() && Other->getNumOperands() == 2 &&
26756         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
26757       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
26758       SDValue CondRHS = Cond->getOperand(1);
26759 
26760       // Look for a general sub with unsigned saturation first.
26761       // x >= y ? x-y : 0 --> subus x, y
26762       // x >  y ? x-y : 0 --> subus x, y
26763       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
26764           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
26765         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
26766 
26767       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
26768         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
26769           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
26770             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
26771               // If the RHS is a constant we have to reverse the const
26772               // canonicalization.
26773               // x > C-1 ? x+-C : 0 --> subus x, C
26774               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
26775                   CondRHSConst->getAPIntValue() ==
26776                       (-OpRHSConst->getAPIntValue() - 1))
26777                 return DAG.getNode(
26778                     X86ISD::SUBUS, DL, VT, OpLHS,
26779                     DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
26780 
26781           // Another special case: If C was a sign bit, the sub has been
26782           // canonicalized into a xor.
26783           // FIXME: Would it be better to use computeKnownBits to determine
26784           //        whether it's safe to decanonicalize the xor?
26785           // x s< 0 ? x^C : 0 --> subus x, C
26786           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
26787               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
26788               OpRHSConst->getAPIntValue().isSignBit())
26789             // Note that we have to rebuild the RHS constant here to ensure we
26790             // don't rely on particular values of undef lanes.
26791             return DAG.getNode(
26792                 X86ISD::SUBUS, DL, VT, OpLHS,
26793                 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
26794         }
26795     }
26796   }
26797 
26798   // Simplify vector selection if condition value type matches vselect
26799   // operand type
26800   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
26801     assert(Cond.getValueType().isVector() &&
26802            "vector select expects a vector selector!");
26803 
26804     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
26805     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
26806 
26807     // Try inverting the condition if the true value is not all 1s and the
26808     // false value is not all 0s.
26809     if (!TValIsAllOnes && !FValIsAllZeros &&
26810         // Check if the selector will be produced by CMPP*/PCMP*
26811         Cond.getOpcode() == ISD::SETCC &&
26812         // Check if SETCC has already been promoted
26813         TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
26814             CondVT) {
26815       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
26816       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
26817 
26818       if (TValIsAllZeros || FValIsAllOnes) {
26819         SDValue CC = Cond.getOperand(2);
26820         ISD::CondCode NewCC =
26821           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
26822                                Cond.getOperand(0).getValueType().isInteger());
26823         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
26824         std::swap(LHS, RHS);
26825         TValIsAllOnes = FValIsAllOnes;
26826         FValIsAllZeros = TValIsAllZeros;
26827       }
26828     }
26829 
26830     if (TValIsAllOnes || FValIsAllZeros) {
26831       SDValue Ret;
26832 
26833       if (TValIsAllOnes && FValIsAllZeros)
26834         Ret = Cond;
26835       else if (TValIsAllOnes)
26836         Ret =
26837             DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
26838       else if (FValIsAllZeros)
26839         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
26840                           DAG.getBitcast(CondVT, LHS));
26841 
26842       return DAG.getBitcast(VT, Ret);
26843     }
26844   }
26845 
26846   // If this is a *dynamic* select (non-constant condition) and we can match
26847   // this node with one of the variable blend instructions, restructure the
26848   // condition so that the blends can use the high bit of each element and use
26849   // SimplifyDemandedBits to simplify the condition operand.
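  //
  // Sketch of the intent (described, not quoting generated code): variable
  // blends such as PBLENDVB/BLENDVPS only read the most significant bit of
  // each condition element, so it is enough to demand that single bit here;
  // rewriting the users to X86ISD::SHRUNKBLEND records that the condition is
  // already in sign-bit form and must not be re-interpreted as a full mask.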
26850   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
26851       !DCI.isBeforeLegalize() &&
26852       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
26853     unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
26854 
26855     // Don't optimize vector selects that map to mask-registers.
26856     if (BitWidth == 1)
26857       return SDValue();
26858 
26859     // We can only handle the cases where VSELECT is directly legal on the
26860     // subtarget. We custom lower VSELECT nodes with constant conditions and
26861     // this makes it hard to see whether a dynamic VSELECT will correctly
26862     // lower, so we both check the operation's status and explicitly handle the
26863     // cases where a *dynamic* blend will fail even though a constant-condition
26864     // blend could be custom lowered.
26865     // FIXME: We should find a better way to handle this class of problems.
26866     // Potentially, we should combine constant-condition vselect nodes
26867     // pre-legalization into shuffles and not mark as many types as custom
26868     // lowered.
26869     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
26870       return SDValue();
26871     // FIXME: We don't support i16-element blends currently. We could and
26872     // should support them by making *all* the bits in the condition be set
26873     // rather than just the high bit and using an i8-element blend.
26874     if (VT.getVectorElementType() == MVT::i16)
26875       return SDValue();
26876     // Dynamic blending was only available from SSE4.1 onward.
26877     if (VT.is128BitVector() && !Subtarget.hasSSE41())
26878       return SDValue();
26879     // Byte blends are only available in AVX2
26880     if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
26881       return SDValue();
26882 
26883     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
26884     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
26885 
26886     APInt KnownZero, KnownOne;
26887     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
26888                                           DCI.isBeforeLegalizeOps());
26889     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
26890         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
26891                                  TLO)) {
26892       // If we changed the computation somewhere in the DAG, this change
26893       // will affect all users of Cond.
26894       // Make sure this is fine and update all the nodes so that we do not
26895       // use the generic VSELECT anymore. Otherwise, we may perform wrong
26896       // optimizations, as we would have changed the expected contents of
26897       // the vector boolean values.
26898       if (Cond != TLO.Old) {
26899         // Check all uses of the condition operand to see whether it will be
26900         // consumed by non-BLEND instructions, which may depend on all bits
26901         // being set properly.
26902         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26903              I != E; ++I)
26904           if (I->getOpcode() != ISD::VSELECT)
26905             // TODO: Add other opcodes eventually lowered into BLEND.
26906             return SDValue();
26907 
26908         // Update all the users of the condition, before committing the change,
26909         // so that the VSELECT optimizations that expect the correct vector
26910         // boolean value will not be triggered.
26911         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26912              I != E; ++I)
26913           DAG.ReplaceAllUsesOfValueWith(
26914               SDValue(*I, 0),
26915               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
26916                           Cond, I->getOperand(1), I->getOperand(2)));
26917         DCI.CommitTargetLoweringOpt(TLO);
26918         return SDValue();
26919       }
26920       // At this point, only Cond is changed. Change the condition
26921       // just for N to keep the opportunity to optimize all the other
26922       // users in their own way.
26923       DAG.ReplaceAllUsesOfValueWith(
26924           SDValue(N, 0),
26925           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
26926                       TLO.New, N->getOperand(1), N->getOperand(2)));
26927       return SDValue();
26928     }
26929   }
26930 
26931   return SDValue();
26932 }
26933 
26934 /// Combine:
26935 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
26936 /// to:
26937 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
26938 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
26939 /// Note that this is only legal for some op/cc combinations.
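///
/// A rough before/after sketch in assembly (register choices are assumed):
///   movl   $1, %eax
///   lock xaddl %eax, (%rdi)   ; %eax receives the old value
///   testl  %eax, %eax
///   js     <target>           ; branch on COND_S of the old value
/// becomes
///   lock addl $1, (%rdi)      ; X86ISD::LADD, only EFLAGS is consumed
///   jle    <target>           ; COND_LE checks the incremented value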
26940 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
26941                                        SelectionDAG &DAG) {
26942   // This combine only operates on CMP-like nodes.
26943   if (!(Cmp.getOpcode() == X86ISD::CMP ||
26944         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
26945     return SDValue();
26946 
26947   // This only applies to variations of the common case:
26948   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
26949   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
26950   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
26951   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
26952   // Using the proper condcodes (see below), overflow is checked for.
26953 
26954   // FIXME: We can generalize both constraints:
26955   // - XOR/OR/AND (if they were made to survive AtomicExpand)
26956   // - LHS != 1
26957   // if the result is compared.
26958 
26959   SDValue CmpLHS = Cmp.getOperand(0);
26960   SDValue CmpRHS = Cmp.getOperand(1);
26961 
26962   if (!CmpLHS.hasOneUse())
26963     return SDValue();
26964 
26965   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
26966   if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
26967     return SDValue();
26968 
26969   const unsigned Opc = CmpLHS.getOpcode();
26970 
26971   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
26972     return SDValue();
26973 
26974   SDValue OpRHS = CmpLHS.getOperand(2);
26975   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
26976   if (!OpRHSC)
26977     return SDValue();
26978 
26979   APInt Addend = OpRHSC->getAPIntValue();
26980   if (Opc == ISD::ATOMIC_LOAD_SUB)
26981     Addend = -Addend;
26982 
26983   if (CC == X86::COND_S && Addend == 1)
26984     CC = X86::COND_LE;
26985   else if (CC == X86::COND_NS && Addend == 1)
26986     CC = X86::COND_G;
26987   else if (CC == X86::COND_G && Addend == -1)
26988     CC = X86::COND_GE;
26989   else if (CC == X86::COND_LE && Addend == -1)
26990     CC = X86::COND_L;
26991   else
26992     return SDValue();
26993 
26994   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
26995   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
26996                                 DAG.getUNDEF(CmpLHS.getValueType()));
26997   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
26998   return LockOp;
26999 }
27000 
27001 // Check whether a boolean test is testing a boolean value generated by
27002 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper
27003 // condition code.
27004 //
27005 // Simplify the following patterns:
27006 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
27007 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
27008 // to (Op EFLAGS Cond)
27009 //
27010 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
27011 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
27012 // to (Op EFLAGS !Cond)
27013 //
27014 // where Op could be BRCOND or CMOV.
27015 //
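//
// For instance (a hedged sketch of the DAG shapes; operand types elided):
//   t1: i8 = X86ISD::SETCC COND_L, t0(EFLAGS)
//   t2     = X86ISD::CMP t1, Constant<0>
//   brcond ..., COND_E, t2
// simplifies to a branch on COND_GE using t0 directly, dropping the
// materialized i8 boolean and the extra compare.
//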
27016 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
27017   // This combine only operates on CMP-like nodes.
27018   if (!(Cmp.getOpcode() == X86ISD::CMP ||
27019         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
27020     return SDValue();
27021 
27022   // Quit if not used as a boolean value.
27023   if (CC != X86::COND_E && CC != X86::COND_NE)
27024     return SDValue();
27025 
27026   // Check CMP operands. One of them should be 0 or 1 and the other should be
27027   // a SetCC or a value extended from it.
27028   SDValue Op1 = Cmp.getOperand(0);
27029   SDValue Op2 = Cmp.getOperand(1);
27030 
27031   SDValue SetCC;
27032   const ConstantSDNode* C = nullptr;
27033   bool needOppositeCond = (CC == X86::COND_E);
27034   bool checkAgainstTrue = false; // Is it a comparison against 1?
27035 
27036   if ((C = dyn_cast<ConstantSDNode>(Op1)))
27037     SetCC = Op2;
27038   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
27039     SetCC = Op1;
27040   else // Quit if neither operand is a constant.
27041     return SDValue();
27042 
27043   if (C->getZExtValue() == 1) {
27044     needOppositeCond = !needOppositeCond;
27045     checkAgainstTrue = true;
27046   } else if (C->getZExtValue() != 0)
27047     // Quit if the constant is neither 0 nor 1.
27048     return SDValue();
27049 
27050   bool truncatedToBoolWithAnd = false;
27051   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
27052   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
27053          SetCC.getOpcode() == ISD::TRUNCATE ||
27054          SetCC.getOpcode() == ISD::AssertZext ||
27055          SetCC.getOpcode() == ISD::AND) {
27056     if (SetCC.getOpcode() == ISD::AND) {
27057       int OpIdx = -1;
27058       if (isOneConstant(SetCC.getOperand(0)))
27059         OpIdx = 1;
27060       if (isOneConstant(SetCC.getOperand(1)))
27061         OpIdx = 0;
27062       if (OpIdx < 0)
27063         break;
27064       SetCC = SetCC.getOperand(OpIdx);
27065       truncatedToBoolWithAnd = true;
27066     } else
27067       SetCC = SetCC.getOperand(0);
27068   }
27069 
27070   switch (SetCC.getOpcode()) {
27071   case X86ISD::SETCC_CARRY:
27072     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
27073     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
27074     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
27075     // truncated to i1 using 'and'.
27076     if (checkAgainstTrue && !truncatedToBoolWithAnd)
27077       break;
27078     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
27079            "Invalid use of SETCC_CARRY!");
27080     // FALL THROUGH
27081   case X86ISD::SETCC:
27082     // Set the condition code or opposite one if necessary.
27083     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
27084     if (needOppositeCond)
27085       CC = X86::GetOppositeBranchCondition(CC);
27086     return SetCC.getOperand(1);
27087   case X86ISD::CMOV: {
27088     // Check whether the false/true values are canonical, i.e. 0 or 1.
27089     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
27090     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
27091     // Quit if true value is not a constant.
27092     if (!TVal)
27093       return SDValue();
27094     // Quit if false value is not a constant.
27095     if (!FVal) {
27096       SDValue Op = SetCC.getOperand(0);
27097       // Skip 'zext' or 'trunc' node.
27098       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
27099           Op.getOpcode() == ISD::TRUNCATE)
27100         Op = Op.getOperand(0);
27101       // A special case for rdrand/rdseed, where 0 is set if false cond is
27102       // found.
27103       if ((Op.getOpcode() != X86ISD::RDRAND &&
27104            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
27105         return SDValue();
27106     }
27107     // Quit if false value is not the constant 0 or 1.
27108     bool FValIsFalse = true;
27109     if (FVal && FVal->getZExtValue() != 0) {
27110       if (FVal->getZExtValue() != 1)
27111         return SDValue();
27112       // If FVal is 1, opposite cond is needed.
27113       needOppositeCond = !needOppositeCond;
27114       FValIsFalse = false;
27115     }
27116     // Quit if TVal is not the constant opposite of FVal.
27117     if (FValIsFalse && TVal->getZExtValue() != 1)
27118       return SDValue();
27119     if (!FValIsFalse && TVal->getZExtValue() != 0)
27120       return SDValue();
27121     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
27122     if (needOppositeCond)
27123       CC = X86::GetOppositeBranchCondition(CC);
27124     return SetCC.getOperand(3);
27125   }
27126   }
27127 
27128   return SDValue();
27129 }
27130 
27131 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
27132 /// Match:
27133 ///   (X86or (X86setcc) (X86setcc))
27134 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
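///
/// For example (a hedged sketch; types elided):
///   t1: i8 = X86ISD::SETCC COND_E,  t0(EFLAGS)
///   t2: i8 = X86ISD::SETCC COND_NP, t0(EFLAGS)
///   t3     = and t1, t2
///   t4     = X86ISD::CMP t3, Constant<0>
/// matches with CC0 = COND_E, CC1 = COND_NP, Flags = t0, and isAnd = true.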
27135 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
27136                                            X86::CondCode &CC1, SDValue &Flags,
27137                                            bool &isAnd) {
27138   if (Cond->getOpcode() == X86ISD::CMP) {
27139     if (!isNullConstant(Cond->getOperand(1)))
27140       return false;
27141 
27142     Cond = Cond->getOperand(0);
27143   }
27144 
27145   isAnd = false;
27146 
27147   SDValue SetCC0, SetCC1;
27148   switch (Cond->getOpcode()) {
27149   default: return false;
27150   case ISD::AND:
27151   case X86ISD::AND:
27152     isAnd = true;
27153     // fallthru
27154   case ISD::OR:
27155   case X86ISD::OR:
27156     SetCC0 = Cond->getOperand(0);
27157     SetCC1 = Cond->getOperand(1);
27158     break;
27159   }
27160 
27161   // Make sure we have SETCC nodes, using the same flags value.
27162   if (SetCC0.getOpcode() != X86ISD::SETCC ||
27163       SetCC1.getOpcode() != X86ISD::SETCC ||
27164       SetCC0->getOperand(1) != SetCC1->getOperand(1))
27165     return false;
27166 
27167   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
27168   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
27169   Flags = SetCC0->getOperand(1);
27170   return true;
27171 }
27172 
27173 /// Optimize an EFLAGS definition used according to the condition code \p CC
27174 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
27175 /// uses of chain values.
27176 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
27177                                   SelectionDAG &DAG) {
27178   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
27179     return R;
27180   return combineSetCCAtomicArith(EFLAGS, CC, DAG);
27181 }
27182 
27183 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
27184 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
27185                            TargetLowering::DAGCombinerInfo &DCI,
27186                            const X86Subtarget &Subtarget) {
27187   SDLoc DL(N);
27188 
27189   // If the flag operand isn't dead, don't touch this CMOV.
27190   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
27191     return SDValue();
27192 
27193   SDValue FalseOp = N->getOperand(0);
27194   SDValue TrueOp = N->getOperand(1);
27195   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
27196   SDValue Cond = N->getOperand(3);
27197 
27198   if (CC == X86::COND_E || CC == X86::COND_NE) {
27199     switch (Cond.getOpcode()) {
27200     default: break;
27201     case X86ISD::BSR:
27202     case X86ISD::BSF:
27203       // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
27204       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
27205         return (CC == X86::COND_E) ? FalseOp : TrueOp;
27206     }
27207   }
27208 
27209   // Try to simplify the EFLAGS and condition code operands.
27210   // We can't always do this as FCMOV only supports a subset of X86 cond.
27211   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
27212     if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
27213       SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
27214         Flags};
27215       return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27216     }
27217   }
27218 
27219   // If this is a select between two integer constants, try to do some
27220   // optimizations.  Note that the operands are ordered the opposite of SELECT
27221   // operands.
27222   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
27223     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
27224       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
27225       // larger than FalseC (the false value).
27226       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
27227         CC = X86::GetOppositeBranchCondition(CC);
27228         std::swap(TrueC, FalseC);
27229         std::swap(TrueOp, FalseOp);
27230       }
27231 
27232       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
27233       // This is efficient for any integer data type (including i8/i16) and
27234       // shift amount.
27235       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
27236         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27237                            DAG.getConstant(CC, DL, MVT::i8), Cond);
27238 
27239         // Zero extend the condition if needed.
27240         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
27241 
27242         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
27243         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
27244                            DAG.getConstant(ShAmt, DL, MVT::i8));
27245         if (N->getNumValues() == 2)  // Dead flag value?
27246           return DCI.CombineTo(N, Cond, SDValue());
27247         return Cond;
27248       }
27249 
27250       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
27251       // for any integer data type, including i8/i16.
27252       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
27253         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27254                            DAG.getConstant(CC, DL, MVT::i8), Cond);
27255 
27256         // Zero extend the condition if needed.
27257         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
27258                            FalseC->getValueType(0), Cond);
27259         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27260                            SDValue(FalseC, 0));
27261 
27262         if (N->getNumValues() == 2)  // Dead flag value?
27263           return DCI.CombineTo(N, Cond, SDValue());
27264         return Cond;
27265       }
27266 
27267       // Optimize cases that will turn into an LEA instruction.  This requires
27268       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
27269       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
27270         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
27271         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
27272 
27273         bool isFastMultiplier = false;
27274         if (Diff < 10) {
27275           switch ((unsigned char)Diff) {
27276           default: break;
27277           case 1:  // result = add base, cond
27278           case 2:  // result = lea base(    , cond*2)
27279           case 3:  // result = lea base(cond, cond*2)
27280           case 4:  // result = lea base(    , cond*4)
27281           case 5:  // result = lea base(cond, cond*4)
27282           case 8:  // result = lea base(    , cond*8)
27283           case 9:  // result = lea base(cond, cond*8)
27284             isFastMultiplier = true;
27285             break;
27286           }
27287         }
27288 
27289         if (isFastMultiplier) {
27290           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
27291           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27292                              DAG.getConstant(CC, DL, MVT::i8), Cond);
27293           // Zero extend the condition if needed.
27294           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
27295                              Cond);
27296           // Scale the condition by the difference.
27297           if (Diff != 1)
27298             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
27299                                DAG.getConstant(Diff, DL, Cond.getValueType()));
27300 
27301           // Add the base if non-zero.
27302           if (FalseC->getAPIntValue() != 0)
27303             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27304                                SDValue(FalseC, 0));
27305           if (N->getNumValues() == 2)  // Dead flag value?
27306             return DCI.CombineTo(N, Cond, SDValue());
27307           return Cond;
27308         }
27309       }
27310     }
27311   }
27312 
27313   // Handle these cases:
27314   //   (select (x != c), e, c) -> (select (x != c), e, x),
27315   //   (select (x == c), c, e) -> (select (x == c), x, e)
27316   // where the c is an integer constant, and the "select" is the combination
27317   // of CMOV and CMP.
27318   //
27319   // The rationale for this change is that a conditional-move from a constant
27320   // needs two instructions; however, a conditional-move from a register needs
27321   // only one instruction.
27322   //
27323   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
27324   //  some instruction-combining opportunities. This opt needs to be
27325   //  postponed as late as possible.
27326   //
27327   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
27328     // the DCI.xxxx conditions are provided to postpone the optimization as
27329     // late as possible.
27330 
27331     ConstantSDNode *CmpAgainst = nullptr;
27332     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
27333         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
27334         !isa<ConstantSDNode>(Cond.getOperand(0))) {
27335 
27336       if (CC == X86::COND_NE &&
27337           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
27338         CC = X86::GetOppositeBranchCondition(CC);
27339         std::swap(TrueOp, FalseOp);
27340       }
27341 
27342       if (CC == X86::COND_E &&
27343           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
27344         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
27345                           DAG.getConstant(CC, DL, MVT::i8), Cond };
27346         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27347       }
27348     }
27349   }
27350 
27351   // Fold and/or of setcc's to double CMOV:
27352   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
27353   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
27354   //
27355   // This combine lets us generate:
27356   //   cmovcc1 (jcc1 if we don't have CMOV)
27357   //   cmovcc2 (same)
27358   // instead of:
27359   //   setcc1
27360   //   setcc2
27361   //   and/or
27362   //   cmovne (jne if we don't have CMOV)
27363   // When we can't use the CMOV instruction, it might increase branch
27364   // mispredicts.
27365   // When we can use CMOV, or when there is no mispredict, this improves
27366   // throughput and reduces register pressure.
27367   //
27368   if (CC == X86::COND_NE) {
27369     SDValue Flags;
27370     X86::CondCode CC0, CC1;
27371     bool isAndSetCC;
27372     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
27373       if (isAndSetCC) {
27374         std::swap(FalseOp, TrueOp);
27375         CC0 = X86::GetOppositeBranchCondition(CC0);
27376         CC1 = X86::GetOppositeBranchCondition(CC1);
27377       }
27378 
27379       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
27380         Flags};
27381       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
27382       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
27383       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27384       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
27385       return CMOV;
27386     }
27387   }
27388 
27389   return SDValue();
27390 }
27391 
27392 /// Different mul shrinking modes.
27393 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
27394 
27395 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
27396   EVT VT = N->getOperand(0).getValueType();
27397   if (VT.getScalarSizeInBits() != 32)
27398     return false;
27399 
27400   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
27401   unsigned SignBits[2] = {1, 1};
27402   bool IsPositive[2] = {false, false};
27403   for (unsigned i = 0; i < 2; i++) {
27404     SDValue Opd = N->getOperand(i);
27405 
27406     // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
27407     // compute the sign bits for it separately.
27408     if (Opd.getOpcode() == ISD::ANY_EXTEND) {
27409       // For anyextend, it is safe to assume an appropriate number of leading
27410       // sign/zero bits.
27411       if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
27412         SignBits[i] = 25;
27413       else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
27414                MVT::i16)
27415         SignBits[i] = 17;
27416       else
27417         return false;
27418       IsPositive[i] = true;
27419     } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
27420       // All the operands of BUILD_VECTOR need to be integer constants.
27421       // Find the smallest value range which all the operands belong to.
27422       SignBits[i] = 32;
27423       IsPositive[i] = true;
27424       for (const SDValue &SubOp : Opd.getNode()->op_values()) {
27425         if (SubOp.isUndef())
27426           continue;
27427         auto *CN = dyn_cast<ConstantSDNode>(SubOp);
27428         if (!CN)
27429           return false;
27430         APInt IntVal = CN->getAPIntValue();
27431         if (IntVal.isNegative())
27432           IsPositive[i] = false;
27433         SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
27434       }
27435     } else {
27436       SignBits[i] = DAG.ComputeNumSignBits(Opd);
27437       if (Opd.getOpcode() == ISD::ZERO_EXTEND)
27438         IsPositive[i] = true;
27439     }
27440   }
27441 
27442   bool AllPositive = IsPositive[0] && IsPositive[1];
27443   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
27444   // When ranges are from -128 ~ 127, use MULS8 mode.
27445   if (MinSignBits >= 25)
27446     Mode = MULS8;
27447   // When ranges are from 0 ~ 255, use MULU8 mode.
27448   else if (AllPositive && MinSignBits >= 24)
27449     Mode = MULU8;
27450   // When ranges are from -32768 ~ 32767, use MULS16 mode.
27451   else if (MinSignBits >= 17)
27452     Mode = MULS16;
27453   // When ranges are from 0 ~ 65535, use MULU16 mode.
27454   else if (AllPositive && MinSignBits >= 16)
27455     Mode = MULU16;
27456   else
27457     return false;
27458   return true;
27459 }
27460 
27461 /// When the operands of a vector mul are extended from smaller-sized values,
27462 /// like i8 and i16, the type of the mul may be shrunk to generate more
27463 /// efficient code. Two typical patterns are handled:
27464 /// Pattern1:
27465 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
27466 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
27467 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27468 ///     %5 = mul <N x i32> %2, %4
27469 ///
27470 /// Pattern2:
27471 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
27472 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
27473 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27474 ///     %5 = mul <N x i32> %2, %4
27475 ///
27476 /// There are four mul shrinking modes:
27477 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
27478 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
27479 /// generate pmullw+sext32 for it (MULS8 mode).
27480 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
27481 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
27482 /// generate pmullw+zext32 for it (MULU8 mode).
27483 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
27484 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
27485 /// generate pmullw+pmulhw for it (MULS16 mode).
27486 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
27487 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
27488 /// generate pmullw+pmulhuw for it (MULU16 mode).
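///
/// A hedged sketch of the MULU16 case for <8 x i32> (illustrative only):
///   %a32 = zext <8 x i16> %a to <8 x i32>
///   %b32 = zext <8 x i16> %b to <8 x i32>
///   %m   = mul <8 x i32> %a32, %b32
/// is lowered roughly as
///   %lo = pmullw  %a, %b            ; low 16 bits of each product
///   %hi = pmulhuw %a, %b            ; high 16 bits of each product
///   %m  = concat(punpcklwd(%lo, %hi), punpckhwd(%lo, %hi))
/// which interleaves the two halves back into eight 32-bit lanes.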
27489 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
27490                                const X86Subtarget &Subtarget) {
27491   // pmulld is supported since SSE41. It is better to use pmulld
27492   // instead of pmullw+pmulhw.
27493   if (Subtarget.hasSSE41())
27494     return SDValue();
27495 
27496   ShrinkMode Mode;
27497   if (!canReduceVMulWidth(N, DAG, Mode))
27498     return SDValue();
27499 
27500   SDLoc DL(N);
27501   SDValue N0 = N->getOperand(0);
27502   SDValue N1 = N->getOperand(1);
27503   EVT VT = N->getOperand(0).getValueType();
27504   unsigned RegSize = 128;
27505   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
27506   EVT ReducedVT =
27507       EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
27508   // Shrink the operands of mul.
27509   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
27510   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
27511 
27512   if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
27513     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
27514     // lower part is needed.
27515     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
27516     if (Mode == MULU8 || Mode == MULS8) {
27517       return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
27518                          DL, VT, MulLo);
27519     } else {
27520       MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27521       // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
27522       // the higher part is also needed.
27523       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27524                                   ReducedVT, NewN0, NewN1);
27525 
27526       // Repack the lower part and higher part result of mul into a wider
27527       // result.
27528       // Generate shuffle functioning as punpcklwd.
27529       SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
27530       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27531         ShuffleMask[2 * i] = i;
27532         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
27533       }
27534       SDValue ResLo =
27535           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27536       ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
27537       // Generate shuffle functioning as punpckhwd.
27538       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27539         ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
27540         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
27541       }
27542       SDValue ResHi =
27543           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27544       ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
27545       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
27546     }
27547   } else {
27548     // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
27549     // to legalize the mul explicitly because implicit legalization for type
27550     // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
27551     // instructions which will not exist when we explicitly legalize it by
27552     // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
27553     // <4 x i16> undef).
27554     //
27555     // Legalize the operands of mul.
27556     SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
27557                                  DAG.getUNDEF(ReducedVT));
27558     Ops[0] = NewN0;
27559     NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27560     Ops[0] = NewN1;
27561     NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27562 
27563     if (Mode == MULU8 || Mode == MULS8) {
27564       // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
27565       // part is needed.
27566       SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27567 
27568       // Convert the type of the mul result to VT.
27569       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27570       SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
27571                                               : ISD::SIGN_EXTEND_VECTOR_INREG,
27572                                 DL, ResVT, Mul);
27573       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27574                          DAG.getIntPtrConstant(0, DL));
27575     } else {
27576       // Generate the lower and higher parts of the mul: pmullw and
27577       // pmulhw/pmulhuw. For MULU16/MULS16, both parts are needed.
27578       SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27579       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27580                                   OpsVT, NewN0, NewN1);
27581 
27582       // Repack the lower part and higher part result of mul into a wider
27583       // result. Make sure the type of mul result is VT.
27584       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27585       SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
27586       Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
27587       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27588                          DAG.getIntPtrConstant(0, DL));
27589     }
27590   }
27591 }
27592 
27593 /// Optimize a single multiply with constant into two operations in order to
27594 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
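/// For example (illustrative): x * 45 == (x * 9) * 5 lowers to two LEAs,
/// x * 40 == (x * 5) << 3 lowers to an LEA plus a SHL, and x * 17 ==
/// (x << 4) + x / x * 31 == (x << 5) - x become a shift plus an ADD/SUB.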
27595 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
27596                           TargetLowering::DAGCombinerInfo &DCI,
27597                           const X86Subtarget &Subtarget) {
27598   EVT VT = N->getValueType(0);
27599   if (DCI.isBeforeLegalize() && VT.isVector())
27600     return reduceVMULWidth(N, DAG, Subtarget);
27601 
27602   // An imul is usually smaller than the alternative sequence.
27603   if (DAG.getMachineFunction().getFunction()->optForMinSize())
27604     return SDValue();
27605 
27606   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
27607     return SDValue();
27608 
27609   if (VT != MVT::i64 && VT != MVT::i32)
27610     return SDValue();
27611 
27612   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
27613   if (!C)
27614     return SDValue();
27615   uint64_t MulAmt = C->getZExtValue();
27616   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
27617     return SDValue();
27618 
27619   uint64_t MulAmt1 = 0;
27620   uint64_t MulAmt2 = 0;
27621   if ((MulAmt % 9) == 0) {
27622     MulAmt1 = 9;
27623     MulAmt2 = MulAmt / 9;
27624   } else if ((MulAmt % 5) == 0) {
27625     MulAmt1 = 5;
27626     MulAmt2 = MulAmt / 5;
27627   } else if ((MulAmt % 3) == 0) {
27628     MulAmt1 = 3;
27629     MulAmt2 = MulAmt / 3;
27630   }
27631 
27632   SDLoc DL(N);
27633   SDValue NewMul;
27634   if (MulAmt2 &&
27635       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
27636 
27637     if (isPowerOf2_64(MulAmt2) &&
27638         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
27639       // If the second multiplier is pow2, issue it first. We want the multiply by
27640       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
27641       // is an add.
27642       std::swap(MulAmt1, MulAmt2);
27643 
27644     if (isPowerOf2_64(MulAmt1))
27645       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27646                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
27647     else
27648       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
27649                            DAG.getConstant(MulAmt1, DL, VT));
27650 
27651     if (isPowerOf2_64(MulAmt2))
27652       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
27653                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
27654     else
27655       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
27656                            DAG.getConstant(MulAmt2, DL, VT));
27657   }
27658 
27659   if (!NewMul) {
27660     assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
27661            && "Both cases that could cause potential overflows should have "
27662               "already been handled.");
27663     if (isPowerOf2_64(MulAmt - 1))
27664       // (mul x, 2^N + 1) => (add (shl x, N), x)
27665       NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
27666                                 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27667                                 DAG.getConstant(Log2_64(MulAmt - 1), DL,
27668                                 MVT::i8)));
27669 
27670     else if (isPowerOf2_64(MulAmt + 1))
27671       // (mul x, 2^N - 1) => (sub (shl x, N), x)
27672       NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
27673                                 N->getOperand(0),
27674                                 DAG.getConstant(Log2_64(MulAmt + 1),
27675                                 DL, MVT::i8)), N->getOperand(0));
27676   }
27677 
27678   if (NewMul)
27679     // Do not add new nodes to DAG combiner worklist.
27680     DCI.CombineTo(N, NewMul, false);
27681 
27682   return SDValue();
27683 }
27684 
27685 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
27686   SDValue N0 = N->getOperand(0);
27687   SDValue N1 = N->getOperand(1);
27688   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
27689   EVT VT = N0.getValueType();
27690 
27691   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
27692   // since the result of setcc_c is all zeros or all ones.
27693   if (VT.isInteger() && !VT.isVector() &&
27694       N1C && N0.getOpcode() == ISD::AND &&
27695       N0.getOperand(1).getOpcode() == ISD::Constant) {
27696     SDValue N00 = N0.getOperand(0);
27697     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
27698     const APInt &ShAmt = N1C->getAPIntValue();
27699     Mask = Mask.shl(ShAmt);
27700     bool MaskOK = false;
27701     // We can handle cases concerning bit-widening nodes containing setcc_c if
27702     // we carefully interrogate the mask to make sure we are semantics
27703     // preserving.
27704     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
27705     // of the underlying setcc_c operation if the setcc_c was zero extended.
27706     // Consider the following example:
27707     //   zext(setcc_c)                 -> i32 0x0000FFFF
27708     //   c1                            -> i32 0x0000FFFF
27709     //   c2                            -> i32 0x00000001
27710     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
27711     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
27712     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
27713       MaskOK = true;
27714     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
27715                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27716       MaskOK = true;
27717     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
27718                 N00.getOpcode() == ISD::ANY_EXTEND) &&
27719                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27720       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
27721     }
27722     if (MaskOK && Mask != 0) {
27723       SDLoc DL(N);
27724       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
27725     }
27726   }
27727 
27728   // Hardware support for vector shifts is sparse which makes us scalarize the
27729   // vector operations in many cases. Also, on sandybridge ADD is faster than
27730   // shl.
27731   // (shl V, 1) -> add V,V
27732   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
27733     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
27734       assert(N0.getValueType().isVector() && "Invalid vector shift type");
27735       // We shift all of the values by one. In many cases we do not have
27736       // hardware support for this operation. This is better expressed as an ADD
27737       // of two values.
27738       if (N1SplatC->getAPIntValue() == 1)
27739         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
27740     }
27741 
27742   return SDValue();
27743 }
27744 
27745 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
27746   SDValue N0 = N->getOperand(0);
27747   SDValue N1 = N->getOperand(1);
27748   EVT VT = N0.getValueType();
27749   unsigned Size = VT.getSizeInBits();
27750 
27751   // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
27752   // into (shl (sext a), [56,48,32,24,16] - SarConst) or
27753   // into (sra (sext a), SarConst - [56,48,32,24,16]),
27754   // depending on the sign of (SarConst - [56,48,32,24,16]).
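  // For example (illustrative): on i32, (sra (shl x, 24), 27) sign-extends the
  // low byte of x and then shifts it right by 3, so it becomes
  // (sra (sext_inreg x, i8), 3); with SarConst == 24 the pair collapses to a
  // single sext_inreg (a movsx).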
27755 
27756   // sexts on X86 are MOVs (movsx). The MOVs have the same code size
27757   // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
27758   // However, the MOVs have two advantages over a SHIFT:
27759   // 1. MOVs can write to a register that differs from the source.
27760   // 2. MOVs accept memory operands.
27761 
27762   if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
27763       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
27764       N0.getOperand(1).getOpcode() != ISD::Constant)
27765     return SDValue();
27766 
27767   SDValue N00 = N0.getOperand(0);
27768   SDValue N01 = N0.getOperand(1);
27769   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
27770   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
27771   EVT CVT = N1.getValueType();
27772 
27773   if (SarConst.isNegative())
27774     return SDValue();
27775 
27776   for (MVT SVT : MVT::integer_valuetypes()) {
27777     unsigned ShiftSize = SVT.getSizeInBits();
27778     // Skip types that have no corresponding sext/zext and ShlConst values
27779     // that are not one of [56,48,32,24,16].
27780     if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
27781       continue;
27782     SDLoc DL(N);
27783     SDValue NN =
27784         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
27785     SarConst = SarConst - (Size - ShiftSize);
27786     if (SarConst == 0)
27787       return NN;
27788     else if (SarConst.isNegative())
27789       return DAG.getNode(ISD::SHL, DL, VT, NN,
27790                          DAG.getConstant(-SarConst, DL, CVT));
27791     else
27792       return DAG.getNode(ISD::SRA, DL, VT, NN,
27793                          DAG.getConstant(SarConst, DL, CVT));
27794   }
27795   return SDValue();
27796 }
27797 
27798 /// \brief Returns a vector of 0s if the node in input is a vector logical
27799 /// shift by a constant amount which is known to be bigger than or equal
27800 /// to the vector element size in bits.
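/// For example (illustrative): (srl <4 x i32> %x, splat 32) folds to a
/// <4 x i32> zero vector, since psrld with an immediate count >= 32 always
/// produces zero.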
27801 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
27802                                       const X86Subtarget &Subtarget) {
27803   EVT VT = N->getValueType(0);
27804 
27805   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
27806       (!Subtarget.hasInt256() ||
27807        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
27808     return SDValue();
27809 
27810   SDValue Amt = N->getOperand(1);
27811   SDLoc DL(N);
27812   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
27813     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
27814       const APInt &ShiftAmt = AmtSplat->getAPIntValue();
27815       unsigned MaxAmount =
27816         VT.getSimpleVT().getVectorElementType().getSizeInBits();
27817 
27818       // SSE2/AVX2 logical shifts always return a vector of 0s
27819       // if the shift amount is bigger than or equal to
27820       // the element size. The constant shift amount will be
27821       // encoded as an 8-bit immediate.
27822       if (ShiftAmt.trunc(8).uge(MaxAmount))
27823         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
27824     }
27825 
27826   return SDValue();
27827 }
27828 
27829 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
27830                             TargetLowering::DAGCombinerInfo &DCI,
27831                             const X86Subtarget &Subtarget) {
27832   if (N->getOpcode() == ISD::SHL)
27833     if (SDValue V = combineShiftLeft(N, DAG))
27834       return V;
27835 
27836   if (N->getOpcode() == ISD::SRA)
27837     if (SDValue V = combineShiftRightAlgebraic(N, DAG))
27838       return V;
27839 
27840   // Try to fold this logical shift into a zero vector.
27841   if (N->getOpcode() != ISD::SRA)
27842     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
27843       return V;
27844 
27845   return SDValue();
27846 }
27847 
27848 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
27849 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
27850 /// OR -> CMPNEQSS.
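/// For example (a sketch): the scalar "a == b" lowering produces
/// (and (setcc E, (cmp a, b)), (setcc NP, (cmp a, b))); E tests equality and
/// NP rules out the unordered case, so the pair collapses to a single
/// cmpeqss-style mask (FSETCC with immediate 0) from which one bit is
/// extracted.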
27851 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
27852                                    TargetLowering::DAGCombinerInfo &DCI,
27853                                    const X86Subtarget &Subtarget) {
27854   unsigned opcode;
27855 
27856   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
27857   // we're requiring SSE2 for both.
27858   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
27859     SDValue N0 = N->getOperand(0);
27860     SDValue N1 = N->getOperand(1);
27861     SDValue CMP0 = N0->getOperand(1);
27862     SDValue CMP1 = N1->getOperand(1);
27863     SDLoc DL(N);
27864 
27865     // The SETCCs should both refer to the same CMP.
27866     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
27867       return SDValue();
27868 
27869     SDValue CMP00 = CMP0->getOperand(0);
27870     SDValue CMP01 = CMP0->getOperand(1);
27871     EVT     VT    = CMP00.getValueType();
27872 
27873     if (VT == MVT::f32 || VT == MVT::f64) {
27874       bool ExpectingFlags = false;
27875       // Check for any users that want flags:
27876       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
27877            !ExpectingFlags && UI != UE; ++UI)
27878         switch (UI->getOpcode()) {
27879         default:
27880         case ISD::BR_CC:
27881         case ISD::BRCOND:
27882         case ISD::SELECT:
27883           ExpectingFlags = true;
27884           break;
27885         case ISD::CopyToReg:
27886         case ISD::SIGN_EXTEND:
27887         case ISD::ZERO_EXTEND:
27888         case ISD::ANY_EXTEND:
27889           break;
27890         }
27891 
27892       if (!ExpectingFlags) {
27893         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
27894         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
27895 
27896         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
27897           X86::CondCode tmp = cc0;
27898           cc0 = cc1;
27899           cc1 = tmp;
27900         }
27901 
27902         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
27903             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
27904           // FIXME: need symbolic constants for these magic numbers.
27905           // See X86ATTInstPrinter.cpp:printSSECC().
27906           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
27907           if (Subtarget.hasAVX512()) {
27908             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
27909                                          CMP01,
27910                                          DAG.getConstant(x86cc, DL, MVT::i8));
27911             if (N->getValueType(0) != MVT::i1)
27912               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
27913                                  FSetCC);
27914             return FSetCC;
27915           }
27916           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
27917                                               CMP00.getValueType(), CMP00, CMP01,
27918                                               DAG.getConstant(x86cc, DL,
27919                                                               MVT::i8));
27920 
27921           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
27922           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
27923 
27924           if (is64BitFP && !Subtarget.is64Bit()) {
27925             // On a 32-bit target, we cannot bitcast the 64-bit float to a
27926             // 64-bit integer, since that's not a legal type. Since
27927             // OnesOrZeroesF is all ones or all zeroes, we don't need all the
27928             // bits, but can do this little dance to extract the lowest 32 bits
27929             // and work with those going forward.
27930             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
27931                                            OnesOrZeroesF);
27932             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
27933             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
27934                                         Vector32, DAG.getIntPtrConstant(0, DL));
27935             IntVT = MVT::i32;
27936           }
27937 
27938           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
27939           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
27940                                       DAG.getConstant(1, DL, IntVT));
27941           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27942                                               ANDed);
27943           return OneBitOfTruth;
27944         }
27945       }
27946     }
27947   }
27948   return SDValue();
27949 }
27950 
27951 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
27952 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
27953   assert(N->getOpcode() == ISD::AND);
27954 
27955   EVT VT = N->getValueType(0);
27956   SDValue N0 = N->getOperand(0);
27957   SDValue N1 = N->getOperand(1);
27958   SDLoc DL(N);
27959 
27960   if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
27961       VT != MVT::v8i64 && VT != MVT::v16i32 &&
27962       VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
27963     return SDValue();
27964 
27965   // Canonicalize XOR to the left.
27966   if (N1.getOpcode() == ISD::XOR)
27967     std::swap(N0, N1);
27968 
27969   if (N0.getOpcode() != ISD::XOR)
27970     return SDValue();
27971 
27972   SDValue N00 = N0->getOperand(0);
27973   SDValue N01 = N0->getOperand(1);
27974 
27975   N01 = peekThroughBitcasts(N01);
27976 
27977   // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
27978   // insert_subvector building a 256-bit AllOnes vector.
27979   if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
27980     if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
27981       return SDValue();
27982 
27983     SDValue V1 = N01->getOperand(0);
27984     SDValue V2 = N01->getOperand(1);
27985     if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
27986         !V1.getOperand(0).isUndef() ||
27987         !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
27988         !ISD::isBuildVectorAllOnes(V2.getNode()))
27989       return SDValue();
27990   }
27991   return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
27992 }
27993 
27994 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
27995 // register. In most cases we actually compare or select YMM-sized registers
27996 // and mixing the two types creates horrible code. This method optimizes
27997 // some of the transition sequences.
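// For example (illustrative): (sext v8i32 (and (trunc v8i32 %a to v8i16),
// (trunc v8i32 %b to v8i16))) is rewritten as
// (sext_inreg (and v8i32 %a, %b), v8i16), keeping the logic op in a YMM-sized
// register instead of bouncing through XMM.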
27998 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
27999                                  TargetLowering::DAGCombinerInfo &DCI,
28000                                  const X86Subtarget &Subtarget) {
28001   EVT VT = N->getValueType(0);
28002   if (!VT.is256BitVector())
28003     return SDValue();
28004 
28005   assert((N->getOpcode() == ISD::ANY_EXTEND ||
28006           N->getOpcode() == ISD::ZERO_EXTEND ||
28007           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
28008 
28009   SDValue Narrow = N->getOperand(0);
28010   EVT NarrowVT = Narrow->getValueType(0);
28011   if (!NarrowVT.is128BitVector())
28012     return SDValue();
28013 
28014   if (Narrow->getOpcode() != ISD::XOR &&
28015       Narrow->getOpcode() != ISD::AND &&
28016       Narrow->getOpcode() != ISD::OR)
28017     return SDValue();
28018 
28019   SDValue N0  = Narrow->getOperand(0);
28020   SDValue N1  = Narrow->getOperand(1);
28021   SDLoc DL(Narrow);
28022 
28023   // The left side has to be a trunc.
28024   if (N0.getOpcode() != ISD::TRUNCATE)
28025     return SDValue();
28026 
28027   // The type of the truncated inputs.
28028   EVT WideVT = N0->getOperand(0)->getValueType(0);
28029   if (WideVT != VT)
28030     return SDValue();
28031 
28032   // The right side has to be a 'trunc' or a constant vector.
28033   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
28034   ConstantSDNode *RHSConstSplat = nullptr;
28035   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
28036     RHSConstSplat = RHSBV->getConstantSplatNode();
28037   if (!RHSTrunc && !RHSConstSplat)
28038     return SDValue();
28039 
28040   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28041 
28042   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
28043     return SDValue();
28044 
28045   // Set N0 and N1 to hold the inputs to the new wide operation.
28046   N0 = N0->getOperand(0);
28047   if (RHSConstSplat) {
28048     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
28049                      SDValue(RHSConstSplat, 0));
28050     N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
28051   } else if (RHSTrunc) {
28052     N1 = N1->getOperand(0);
28053   }
28054 
28055   // Generate the wide operation.
28056   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
28057   unsigned Opcode = N->getOpcode();
28058   switch (Opcode) {
28059   case ISD::ANY_EXTEND:
28060     return Op;
28061   case ISD::ZERO_EXTEND: {
28062     unsigned InBits = NarrowVT.getScalarSizeInBits();
28063     APInt Mask = APInt::getAllOnesValue(InBits);
28064     Mask = Mask.zext(VT.getScalarSizeInBits());
28065     return DAG.getNode(ISD::AND, DL, VT,
28066                        Op, DAG.getConstant(Mask, DL, VT));
28067   }
28068   case ISD::SIGN_EXTEND:
28069     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
28070                        Op, DAG.getValueType(NarrowVT));
28071   default:
28072     llvm_unreachable("Unexpected opcode");
28073   }
28074 }
28075 
28076 static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
28077                                  TargetLowering::DAGCombinerInfo &DCI,
28078                                  const X86Subtarget &Subtarget) {
28079   SDValue N0 = N->getOperand(0);
28080   SDValue N1 = N->getOperand(1);
28081   SDLoc DL(N);
28082 
28083   // A vector zext_in_reg may be represented as a shuffle,
28084   // feeding into a bitcast (this represents anyext) feeding into
28085   // an and with a mask.
28086   // We'd like to try to combine that into a shuffle with zero
28087   // plus a bitcast, removing the and.
28088   if (N0.getOpcode() != ISD::BITCAST ||
28089       N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
28090     return SDValue();
28091 
28092   // The other side of the AND should be a splat of 2^C, where C
28093   // is the number of bits in the source type.
28094   N1 = peekThroughBitcasts(N1);
28095   if (N1.getOpcode() != ISD::BUILD_VECTOR)
28096     return SDValue();
28097   BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
28098 
28099   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
28100   EVT SrcType = Shuffle->getValueType(0);
28101 
28102   // We expect a single-source shuffle
28103   if (!Shuffle->getOperand(1)->isUndef())
28104     return SDValue();
28105 
28106   unsigned SrcSize = SrcType.getScalarSizeInBits();
28107   unsigned NumElems = SrcType.getVectorNumElements();
28108 
28109   APInt SplatValue, SplatUndef;
28110   unsigned SplatBitSize;
28111   bool HasAnyUndefs;
28112   if (!Vector->isConstantSplat(SplatValue, SplatUndef,
28113                                 SplatBitSize, HasAnyUndefs))
28114     return SDValue();
28115 
28116   unsigned ResSize = N1.getValueType().getScalarSizeInBits();
28117   // Make sure the splat matches the mask we expect
28118   if (SplatBitSize > ResSize ||
28119       (SplatValue + 1).exactLogBase2() != (int)SrcSize)
28120     return SDValue();
28121 
28122   // Make sure the input and output size make sense
28123   if (SrcSize >= ResSize || ResSize % SrcSize)
28124     return SDValue();
28125 
28126   // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>
28127   // The number of u's between each two values depends on the ratio between
28128   // the source and dest type.
28129   unsigned ZextRatio = ResSize / SrcSize;
28130   bool IsZext = true;
28131   for (unsigned i = 0; i != NumElems; ++i) {
28132     if (i % ZextRatio) {
28133       if (Shuffle->getMaskElt(i) > 0) {
28134         // Expected undef
28135         IsZext = false;
28136         break;
28137       }
28138     } else {
28139       if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
28140         // Expected element number
28141         IsZext = false;
28142         break;
28143       }
28144     }
28145   }
28146 
28147   if (!IsZext)
28148     return SDValue();
28149 
28150   // Ok, perform the transformation - replace the shuffle with
28151   // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
28152   // (instead of undef) where the k elements come from the zero vector.
28153   SmallVector<int, 8> Mask;
28154   for (unsigned i = 0; i != NumElems; ++i)
28155     if (i % ZextRatio)
28156       Mask.push_back(NumElems);
28157     else
28158       Mask.push_back(i / ZextRatio);
28159 
28160   SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
28161     Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
28162   return DAG.getBitcast(N0.getValueType(), NewShuffle);
28163 }
28164 
28165 /// If both input operands of a logic op are being cast from floating point
28166 /// types, try to convert this into a floating point logic node to avoid
28167 /// unnecessary moves from SSE to integer registers.
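/// For example (illustrative): (i32 (and (bitcast f32 %a), (bitcast f32 %b)))
/// becomes (i32 (bitcast (FAND %a, %b))), so the mask stays in an XMM register
/// instead of being moved to a GPR for the integer AND.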
28168 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
28169                                         const X86Subtarget &Subtarget) {
28170   unsigned FPOpcode = ISD::DELETED_NODE;
28171   if (N->getOpcode() == ISD::AND)
28172     FPOpcode = X86ISD::FAND;
28173   else if (N->getOpcode() == ISD::OR)
28174     FPOpcode = X86ISD::FOR;
28175   else if (N->getOpcode() == ISD::XOR)
28176     FPOpcode = X86ISD::FXOR;
28177 
28178   assert(FPOpcode != ISD::DELETED_NODE &&
28179          "Unexpected input node for FP logic conversion");
28180 
28181   EVT VT = N->getValueType(0);
28182   SDValue N0 = N->getOperand(0);
28183   SDValue N1 = N->getOperand(1);
28184   SDLoc DL(N);
28185   if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
28186       ((Subtarget.hasSSE1() && VT == MVT::i32) ||
28187        (Subtarget.hasSSE2() && VT == MVT::i64))) {
28188     SDValue N00 = N0.getOperand(0);
28189     SDValue N10 = N1.getOperand(0);
28190     EVT N00Type = N00.getValueType();
28191     EVT N10Type = N10.getValueType();
28192     if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
28193       SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
28194       return DAG.getBitcast(VT, FPLogic);
28195     }
28196   }
28197   return SDValue();
28198 }
28199 
28200 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
28201 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
28202 /// eliminate loading the vector constant mask value. This relies on the fact
28203 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
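/// For example (illustrative): (and (pcmpgt v4i32 %a, %b), splat 1) becomes
/// (psrld (pcmpgt %a, %b), 31); shifting the all-ones/all-zeros lanes right by
/// 31 yields the same 0/1 result without loading the splat-of-1 constant.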
28204 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
28205   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
28206   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
28207 
28208   // TODO: Use AssertSext to mark any nodes that have the property of producing
28209   // all-ones or all-zeros. Then check for that node rather than particular
28210   // opcodes.
28211   if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
28212     return SDValue();
28213 
28214   // The existence of the PCMP node guarantees that we have the required SSE2 or
28215   // AVX2 for a shift of this vector type, but there is no vector shift by
28216   // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
28217   // masked compare nodes, so they should not make it here.
28218   EVT VT0 = Op0.getValueType();
28219   EVT VT1 = Op1.getValueType();
28220   unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
28221   if (VT0 != VT1 || EltBitWidth == 8)
28222     return SDValue();
28223 
28224   assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
28225 
28226   APInt SplatVal;
28227   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
28228     return SDValue();
28229 
28230   SDLoc DL(N);
28231   SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
28232   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
28233   return DAG.getBitcast(N->getValueType(0), Shift);
28234 }
28235 
28236 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
28237                           TargetLowering::DAGCombinerInfo &DCI,
28238                           const X86Subtarget &Subtarget) {
28239   if (DCI.isBeforeLegalizeOps())
28240     return SDValue();
28241 
28242   if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
28243     return Zext;
28244 
28245   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28246     return R;
28247 
28248   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28249     return FPLogic;
28250 
28251   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
28252     return R;
28253 
28254   if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
28255     return ShiftRight;
28256 
28257   EVT VT = N->getValueType(0);
28258   SDValue N0 = N->getOperand(0);
28259   SDValue N1 = N->getOperand(1);
28260   SDLoc DL(N);
28261 
28262   // Create BEXTR instructions
28263   // BEXTR is ((X >> imm) & (2**size-1))
28264   if (VT != MVT::i32 && VT != MVT::i64)
28265     return SDValue();
28266 
28267   if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
28268     return SDValue();
28269   if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
28270     return SDValue();
28271 
28272   ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
28273   ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
28274   if (MaskNode && ShiftNode) {
28275     uint64_t Mask = MaskNode->getZExtValue();
28276     uint64_t Shift = ShiftNode->getZExtValue();
28277     if (isMask_64(Mask)) {
28278       uint64_t MaskSize = countPopulation(Mask);
28279       if (Shift + MaskSize <= VT.getSizeInBits())
28280         return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
28281                            DAG.getConstant(Shift | (MaskSize << 8), DL,
28282                                            VT));
28283     }
28284   }
28285   return SDValue();
28286 }
28287 
28288 // Try to fold:
28289 //   (or (and (m, y), (pandn m, x)))
28290 // into:
28291 //   (vselect m, x, y)
28292 // As a special case, try to fold:
28293 //   (or (and (m, (sub 0, x)), (pandn m, x)))
28294 // into:
28295 //   (sub (xor X, M), M)
28296 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
28297                                             const X86Subtarget &Subtarget) {
28298   assert(N->getOpcode() == ISD::OR);
28299 
28300   SDValue N0 = N->getOperand(0);
28301   SDValue N1 = N->getOperand(1);
28302   EVT VT = N->getValueType(0);
28303 
28304   if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
28305     return SDValue();
28306   assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
28307 
28308   // Canonicalize pandn to RHS
28309   if (N0.getOpcode() == X86ISD::ANDNP)
28310     std::swap(N0, N1);
28311 
28312   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
28313     return SDValue();
28314 
28315   SDValue Mask = N1.getOperand(0);
28316   SDValue X = N1.getOperand(1);
28317   SDValue Y;
28318   if (N0.getOperand(0) == Mask)
28319     Y = N0.getOperand(1);
28320   if (N0.getOperand(1) == Mask)
28321     Y = N0.getOperand(0);
28322 
28323   // Check to see if the mask appeared in both the AND and ANDNP.
28324   if (!Y.getNode())
28325     return SDValue();
28326 
28327   // Validate that X, Y, and Mask are bitcasts, and see through them.
28328   Mask = peekThroughBitcasts(Mask);
28329   X = peekThroughBitcasts(X);
28330   Y = peekThroughBitcasts(Y);
28331 
28332   EVT MaskVT = Mask.getValueType();
28333 
28334   // Validate that the Mask operand is a vector sra node.
28335   // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
28336   // there is no psrai.b
28337   unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
28338   unsigned SraAmt = ~0;
28339   if (Mask.getOpcode() == ISD::SRA) {
28340     if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
28341       if (auto *AmtConst = AmtBV->getConstantSplatNode())
28342         SraAmt = AmtConst->getZExtValue();
28343   } else if (Mask.getOpcode() == X86ISD::VSRAI) {
28344     SDValue SraC = Mask.getOperand(1);
28345     SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
28346   }
28347   if ((SraAmt + 1) != EltBits)
28348     return SDValue();
28349 
28350   SDLoc DL(N);
28351 
28352   // Try to match:
28353   //   (or (and (M, (sub 0, X)), (pandn M, X)))
28354   // which is a special case of vselect:
28355   //   (vselect M, (sub 0, X), X)
28356   // Per:
28357   // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
28358   // We know that, if fNegate is 0 or 1:
28359   //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
28360   //
28361   // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
28362   //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
28363   //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
28364   // This lets us transform our vselect to:
28365   //   (add (xor X, M), (and M, 1))
28366   // And further to:
28367   //   (sub (xor X, M), M)
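  // Quick numeric check (illustrative): with M == -1 (all ones) and X == 5,
  // (xor 5, -1) - (-1) == -6 + 1 == -5, i.e. the negation; with M == 0 it is
  // (xor 5, 0) - 0 == 5, i.e. X unchanged.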
28368   if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
28369     auto IsNegV = [](SDNode *N, SDValue V) {
28370       return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
28371         ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
28372     };
28373     SDValue V;
28374     if (IsNegV(Y.getNode(), X))
28375       V = X;
28376     else if (IsNegV(X.getNode(), Y))
28377       V = Y;
28378 
28379     if (V) {
28380       assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
28381       SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
28382       SDValue SubOp2 = Mask;
28383 
28384       // If the negate was on the false side of the select, then
28385       // the operands of the SUB need to be swapped. PR 27251.
28386       // This is because the pattern being matched above is
28387       // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
28388       // but if the pattern matched was
28389       // (vselect M, X, (sub (0, X))), that is really negation of the pattern
28390       // above, -(vselect M, (sub 0, X), X), and therefore the replacement
28391       // pattern also needs to be a negation of the replacement pattern above.
28392       // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
28393       // sub accomplishes the negation of the replacement pattern.
28394       if (V == Y)
28395          std::swap(SubOp1, SubOp2);
28396 
28397       return DAG.getBitcast(VT,
28398                             DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
28399     }
28400   }
28401 
28402   // PBLENDVB is only available on SSE 4.1.
28403   if (!Subtarget.hasSSE41())
28404     return SDValue();
28405 
28406   MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
28407 
28408   X = DAG.getBitcast(BlendVT, X);
28409   Y = DAG.getBitcast(BlendVT, Y);
28410   Mask = DAG.getBitcast(BlendVT, Mask);
28411   Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
28412   return DAG.getBitcast(VT, Mask);
28413 }
28414 
28415 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
28416                          TargetLowering::DAGCombinerInfo &DCI,
28417                          const X86Subtarget &Subtarget) {
28418   if (DCI.isBeforeLegalizeOps())
28419     return SDValue();
28420 
28421   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28422     return R;
28423 
28424   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28425     return FPLogic;
28426 
28427   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
28428     return R;
28429 
28430   SDValue N0 = N->getOperand(0);
28431   SDValue N1 = N->getOperand(1);
28432   EVT VT = N->getValueType(0);
28433 
28434   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
28435     return SDValue();
28436 
28437   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
28438   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
28439 
28440   // SHLD/SHRD instructions have lower register pressure, but on some
28441   // platforms they have higher latency than the equivalent
28442   // series of shifts/or that would otherwise be generated.
28443   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
28444   // have higher latencies and we are not optimizing for size.
28445   if (!OptForSize && Subtarget.isSHLDSlow())
28446     return SDValue();
28447 
28448   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
28449     std::swap(N0, N1);
28450   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
28451     return SDValue();
28452   if (!N0.hasOneUse() || !N1.hasOneUse())
28453     return SDValue();
28454 
28455   SDValue ShAmt0 = N0.getOperand(1);
28456   if (ShAmt0.getValueType() != MVT::i8)
28457     return SDValue();
28458   SDValue ShAmt1 = N1.getOperand(1);
28459   if (ShAmt1.getValueType() != MVT::i8)
28460     return SDValue();
28461   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
28462     ShAmt0 = ShAmt0.getOperand(0);
28463   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
28464     ShAmt1 = ShAmt1.getOperand(0);
28465 
28466   SDLoc DL(N);
28467   unsigned Opc = X86ISD::SHLD;
28468   SDValue Op0 = N0.getOperand(0);
28469   SDValue Op1 = N1.getOperand(0);
28470   if (ShAmt0.getOpcode() == ISD::SUB) {
28471     Opc = X86ISD::SHRD;
28472     std::swap(Op0, Op1);
28473     std::swap(ShAmt0, ShAmt1);
28474   }
28475 
28476   unsigned Bits = VT.getSizeInBits();
28477   if (ShAmt1.getOpcode() == ISD::SUB) {
28478     SDValue Sum = ShAmt1.getOperand(0);
28479     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
28480       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
28481       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
28482         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
28483       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
28484         return DAG.getNode(Opc, DL, VT,
28485                            Op0, Op1,
28486                            DAG.getNode(ISD::TRUNCATE, DL,
28487                                        MVT::i8, ShAmt0));
28488     }
28489   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
28490     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
28491     if (ShAmt0C &&
28492         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
28493       return DAG.getNode(Opc, DL, VT,
28494                          N0.getOperand(0), N1.getOperand(0),
28495                          DAG.getNode(ISD::TRUNCATE, DL,
28496                                        MVT::i8, ShAmt0));
28497   }
28498 
28499   return SDValue();
28500 }
28501 
28502 // Generate NEG and CMOV for integer abs.
28503 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
28504   EVT VT = N->getValueType(0);
28505 
28506   // Since X86 does not have CMOV for 8-bit integer, we don't convert
28507   // 8-bit integer abs to NEG and CMOV.
28508   if (VT.isInteger() && VT.getSizeInBits() == 8)
28509     return SDValue();
28510 
28511   SDValue N0 = N->getOperand(0);
28512   SDValue N1 = N->getOperand(1);
28513   SDLoc DL(N);
28514 
28515   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
28516   // and change it to SUB and CMOV.
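  // For example (illustrative): for i32, abs(x) is commonly emitted as
  //   y = x >> 31;  r = (x + y) ^ y
  // which matches this pattern; it is replaced by a SUB computing 0 - x and a
  // CMOV picking x or the negation based on the SUB's sign flags.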
28517   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
28518       N0.getOpcode() == ISD::ADD &&
28519       N0.getOperand(1) == N1 &&
28520       N1.getOpcode() == ISD::SRA &&
28521       N1.getOperand(0) == N0.getOperand(0))
28522     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
28523       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
28524         // Generate SUB & CMOV.
28525         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28526                                   DAG.getConstant(0, DL, VT), N0.getOperand(0));
28527 
28528         SDValue Ops[] = { N0.getOperand(0), Neg,
28529                           DAG.getConstant(X86::COND_GE, DL, MVT::i8),
28530                           SDValue(Neg.getNode(), 1) };
28531         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
28532       }
28533   return SDValue();
28534 }
28535 
28536 /// Try to turn tests against the signbit in the form of:
28537 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
28538 /// into:
28539 ///   SETGT(X, -1)
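/// For example (illustrative): for i32 %x, (xor (trunc (srl %x, 31)), 1)
/// computes "%x is non-negative", which is exactly (setgt %x, -1).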
28540 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
28541   // This is only worth doing if the output type is i8 or i1.
28542   EVT ResultType = N->getValueType(0);
28543   if (ResultType != MVT::i8 && ResultType != MVT::i1)
28544     return SDValue();
28545 
28546   SDValue N0 = N->getOperand(0);
28547   SDValue N1 = N->getOperand(1);
28548 
28549   // We should be performing an xor against a truncated shift.
28550   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
28551     return SDValue();
28552 
28553   // Make sure we are performing an xor against one.
28554   if (!isOneConstant(N1))
28555     return SDValue();
28556 
28557   // SetCC on x86 zero extends so only act on this if it's a logical shift.
28558   SDValue Shift = N0.getOperand(0);
28559   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
28560     return SDValue();
28561 
28562   // Make sure we are truncating from one of i16, i32 or i64.
28563   EVT ShiftTy = Shift.getValueType();
28564   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
28565     return SDValue();
28566 
28567   // Make sure the shift amount extracts the sign bit.
28568   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
28569       Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
28570     return SDValue();
28571 
28572   // Create a greater-than comparison against -1.
28573   // N.B. Using SETGE against 0 works, but we want a canonical-looking
28574   // comparison; using SETGT matches up with what TranslateX86CC does.
28575   SDLoc DL(N);
28576   SDValue ShiftOp = Shift.getOperand(0);
28577   EVT ShiftOpTy = ShiftOp.getValueType();
28578   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28579   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
28580                                                *DAG.getContext(), ResultType);
28581   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
28582                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
28583   if (SetCCResultType != ResultType)
28584     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
28585   return Cond;
28586 }
28587 
28588 /// Turn vector tests of the signbit in the form of:
28589 ///   xor (sra X, elt_size(X)-1), -1
28590 /// into:
28591 ///   pcmpgt X, -1
28592 ///
28593 /// This should be called before type legalization because the pattern may not
28594 /// persist after that.
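/// For example (illustrative): for v4i32 %x,
///   (xor (sra %x, splat 31), all-ones)
/// tests "lane is non-negative" per element, which is (pcmpgt %x, all-ones).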
28595 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
28596                                          const X86Subtarget &Subtarget) {
28597   EVT VT = N->getValueType(0);
28598   if (!VT.isSimple())
28599     return SDValue();
28600 
28601   switch (VT.getSimpleVT().SimpleTy) {
28602   default: return SDValue();
28603   case MVT::v16i8:
28604   case MVT::v8i16:
28605   case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
28606   case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
28607   case MVT::v32i8:
28608   case MVT::v16i16:
28609   case MVT::v8i32:
28610   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
28611   }
28612 
28613   // There must be a shift right algebraic before the xor, and the xor must be a
28614   // 'not' operation.
28615   SDValue Shift = N->getOperand(0);
28616   SDValue Ones = N->getOperand(1);
28617   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
28618       !ISD::isBuildVectorAllOnes(Ones.getNode()))
28619     return SDValue();
28620 
28621   // The shift should be smearing the sign bit across each vector element.
28622   auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
28623   if (!ShiftBV)
28624     return SDValue();
28625 
28626   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
28627   auto *ShiftAmt = ShiftBV->getConstantSplatNode();
28628   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
28629     return SDValue();
28630 
28631   // Create a greater-than comparison against -1. We don't use the more obvious
28632   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
28633   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
28634 }
28635 
28636 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
28637                                  TargetLowering::DAGCombinerInfo &DCI,
28638                                  const X86Subtarget &Subtarget) {
28639   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
28640     return Cmp;
28641 
28642   if (DCI.isBeforeLegalizeOps())
28643     return SDValue();
28644 
28645   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
28646     return RV;
28647 
28648   if (Subtarget.hasCMov())
28649     if (SDValue RV = combineIntegerAbs(N, DAG))
28650       return RV;
28651 
28652   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28653     return FPLogic;
28654 
28655   return SDValue();
28656 }
28657 
28658 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
28659 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
28660 /// X86ISD::AVG instruction.
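/// For example (illustrative): with a == 6 and b == 9 the pattern computes
/// (6 + 9 + 1) >> 1 == 8, matching pavgb/pavgw's round-half-up semantics,
/// whereas a plain truncating (a + b) >> 1 would give 7.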
28661 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
28662                                 const X86Subtarget &Subtarget,
28663                                 const SDLoc &DL) {
28664   if (!VT.isVector() || !VT.isSimple())
28665     return SDValue();
28666   EVT InVT = In.getValueType();
28667   unsigned NumElems = VT.getVectorNumElements();
28668 
28669   EVT ScalarVT = VT.getVectorElementType();
28670   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
28671         isPowerOf2_32(NumElems)))
28672     return SDValue();
28673 
28674   // InScalarVT is the intermediate type in the AVG pattern, and it should be
28675   // wider than the original input type (i8/i16).
28676   EVT InScalarVT = InVT.getVectorElementType();
28677   if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
28678     return SDValue();
28679 
28680   if (!Subtarget.hasSSE2())
28681     return SDValue();
28682   if (Subtarget.hasAVX512()) {
28683     if (VT.getSizeInBits() > 512)
28684       return SDValue();
28685   } else if (Subtarget.hasAVX2()) {
28686     if (VT.getSizeInBits() > 256)
28687       return SDValue();
28688   } else {
28689     if (VT.getSizeInBits() > 128)
28690       return SDValue();
28691   }
28692 
28693   // Detect the following pattern:
28694   //
28695   //   %1 = zext <N x i8> %a to <N x i32>
28696   //   %2 = zext <N x i8> %b to <N x i32>
28697   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
28698   //   %4 = add nuw nsw <N x i32> %3, %2
28699   //   %5 = lshr <N x i32> %4, <i32 1 x N>
28700   //   %6 = trunc <N x i32> %5 to <N x i8>
28701   //
28702   // In AVX512, the last instruction can also be a trunc store.
28703 
28704   if (In.getOpcode() != ISD::SRL)
28705     return SDValue();
28706 
28707   // A lambda checking the given SDValue is a constant vector and each element
28708   // is in the range [Min, Max].
28709   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
28710     BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
28711     if (!BV || !BV->isConstant())
28712       return false;
28713     for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
28714       ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
28715       if (!C)
28716         return false;
28717       uint64_t Val = C->getZExtValue();
28718       if (Val < Min || Val > Max)
28719         return false;
28720     }
28721     return true;
28722   };
28723 
28724   // Check if each element of the vector is left-shifted by one.
28725   auto LHS = In.getOperand(0);
28726   auto RHS = In.getOperand(1);
28727   if (!IsConstVectorInRange(RHS, 1, 1))
28728     return SDValue();
28729   if (LHS.getOpcode() != ISD::ADD)
28730     return SDValue();
28731 
28732   // Detect a pattern of a + b + 1 where the order doesn't matter.
28733   SDValue Operands[3];
28734   Operands[0] = LHS.getOperand(0);
28735   Operands[1] = LHS.getOperand(1);
28736 
28737   // Take care of the case when one of the operands is a constant vector whose
28738   // elements are in the range [1, 256] for i8 (or [1, 65536] for i16).
28739   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
28740       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
28741       Operands[0].getOperand(0).getValueType() == VT) {
28742     // The pattern is detected. Subtract one from the constant vector, then
28743     // demote it and emit X86ISD::AVG instruction.
28744     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
28745     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
28746     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
28747     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28748                        Operands[1]);
28749   }
28750 
28751   if (Operands[0].getOpcode() == ISD::ADD)
28752     std::swap(Operands[0], Operands[1]);
28753   else if (Operands[1].getOpcode() != ISD::ADD)
28754     return SDValue();
28755   Operands[2] = Operands[1].getOperand(0);
28756   Operands[1] = Operands[1].getOperand(1);
28757 
28758   // Now we have three operands of two additions. Check that one of them is a
28759   // constant vector with ones, and the other two are promoted from i8/i16.
28760   for (int i = 0; i < 3; ++i) {
28761     if (!IsConstVectorInRange(Operands[i], 1, 1))
28762       continue;
28763     std::swap(Operands[i], Operands[2]);
28764 
28765     // Check if Operands[0] and Operands[1] are results of type promotion.
28766     for (int j = 0; j < 2; ++j)
28767       if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
28768           Operands[j].getOperand(0).getValueType() != VT)
28769         return SDValue();
28770 
28771     // The pattern is detected, emit X86ISD::AVG instruction.
28772     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28773                        Operands[1].getOperand(0));
28774   }
28775 
28776   return SDValue();
28777 }
28778 
28779 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
28780                            TargetLowering::DAGCombinerInfo &DCI,
28781                            const X86Subtarget &Subtarget) {
28782   LoadSDNode *Ld = cast<LoadSDNode>(N);
28783   EVT RegVT = Ld->getValueType(0);
28784   EVT MemVT = Ld->getMemoryVT();
28785   SDLoc dl(Ld);
28786   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28787 
28788   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
28789   // into two 16-byte operations.
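  // For example, an unaligned 32-byte load of a v8f32 value becomes two
  // 16-byte v4f32 loads; their chains are joined with a TokenFactor and the
  // two halves are reassembled with insert128BitVector.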
28790   ISD::LoadExtType Ext = Ld->getExtensionType();
28791   bool Fast;
28792   unsigned AddressSpace = Ld->getAddressSpace();
28793   unsigned Alignment = Ld->getAlignment();
28794   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
28795       Ext == ISD::NON_EXTLOAD &&
28796       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
28797                              AddressSpace, Alignment, &Fast) && !Fast) {
28798     unsigned NumElems = RegVT.getVectorNumElements();
28799     if (NumElems < 2)
28800       return SDValue();
28801 
28802     SDValue Ptr = Ld->getBasePtr();
28803 
28804     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
28805                                   NumElems/2);
28806     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
28807                                 Ld->getPointerInfo(), Ld->isVolatile(),
28808                                 Ld->isNonTemporal(), Ld->isInvariant(),
28809                                 Alignment);
28810 
28811     Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
28812     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
28813                                 Ld->getPointerInfo(), Ld->isVolatile(),
28814                                 Ld->isNonTemporal(), Ld->isInvariant(),
28815                                 std::min(16U, Alignment));
28816     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
28817                              Load1.getValue(1),
28818                              Load2.getValue(1));
28819 
28820     SDValue NewVec = DAG.getUNDEF(RegVT);
28821     NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
28822     NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
28823     return DCI.CombineTo(N, NewVec, TF, true);
28824   }
28825 
28826   return SDValue();
28827 }
28828 
28829 /// If V is a build vector of boolean constants and exactly one of those
28830 /// constants is true, return the operand index of that true element.
28831 /// Otherwise, return -1.
28832 static int getOneTrueElt(SDValue V) {
28833   // This needs to be a build vector of booleans.
28834   // TODO: Checking for the i1 type matches the IR definition for the mask,
28835   // but the mask check could be loosened to i8 or other types. That might
28836   // also require checking more than 'allOnesValue'; e.g., the x86 HW
28837   // instructions only require that the MSB is set for each mask element.
28838   // The ISD::MSTORE comments/definition do not specify how the mask operand
28839   // is formatted.
28840   auto *BV = dyn_cast<BuildVectorSDNode>(V);
28841   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
28842     return -1;
28843 
28844   int TrueIndex = -1;
28845   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
28846   for (unsigned i = 0; i < NumElts; ++i) {
28847     const SDValue &Op = BV->getOperand(i);
28848     if (Op.isUndef())
28849       continue;
28850     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
28851     if (!ConstNode)
28852       return -1;
28853     if (ConstNode->getAPIntValue().isAllOnesValue()) {
28854       // If we already found a one, this is too many.
28855       if (TrueIndex >= 0)
28856         return -1;
28857       TrueIndex = i;
28858     }
28859   }
28860   return TrueIndex;
28861 }
28862 
28863 /// Given a masked memory load/store operation, return true if it has one mask
28864 /// bit set. If it has one mask bit set, then also return the memory address of
28865 /// the scalar element to load/store, the vector index to insert/extract that
28866 /// scalar element, and the alignment for the scalar memory access.
28867 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
28868                                          SelectionDAG &DAG, SDValue &Addr,
28869                                          SDValue &Index, unsigned &Alignment) {
28870   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
28871   if (TrueMaskElt < 0)
28872     return false;
28873 
28874   // Get the address of the one scalar element that is specified by the mask
28875   // using the appropriate offset from the base pointer.
28876   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
28877   Addr = MaskedOp->getBasePtr();
28878   if (TrueMaskElt != 0) {
28879     unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
28880     Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
28881   }
28882 
28883   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
28884   Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
28885   return true;
28886 }
28887 
28888 /// If exactly one element of the mask is set for a non-extending masked load,
28889 /// replace it with a scalar load and a vector insert.
28890 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
28891 /// mask have already been optimized in IR, so we don't bother with those here.
28892 static SDValue
28893 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28894                              TargetLowering::DAGCombinerInfo &DCI) {
28895   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
28896   // However, some target hooks may need to be added to know when the transform
28897   // is profitable. Endianness would also have to be considered.
28898 
28899   SDValue Addr, VecIndex;
28900   unsigned Alignment;
28901   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
28902     return SDValue();
28903 
28904   // Load the one scalar element that is specified by the mask using the
28905   // appropriate offset from the base pointer.
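  // For example, a masked load of <4 x float> with mask <0,0,1,0> becomes a
  // scalar f32 load from BasePtr+8 followed by an insert_vector_elt at index 2
  // into the pass-through (src0) vector.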
28906   SDLoc DL(ML);
28907   EVT VT = ML->getValueType(0);
28908   EVT EltVT = VT.getVectorElementType();
28909   SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
28910                              ML->getPointerInfo(), ML->isVolatile(),
28911                              ML->isNonTemporal(), ML->isInvariant(), Alignment);
28912 
28913   // Insert the loaded element into the appropriate place in the vector.
28914   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
28915                                Load, VecIndex);
28916   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
28917 }
28918 
28919 static SDValue
28920 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28921                               TargetLowering::DAGCombinerInfo &DCI) {
28922   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
28923     return SDValue();
28924 
28925   SDLoc DL(ML);
28926   EVT VT = ML->getValueType(0);
28927 
28928   // If we are loading the first and last elements of a vector, it is safe and
28929   // always faster to load the whole vector. Replace the masked load with a
28930   // vector load and select.
28931   unsigned NumElts = VT.getVectorNumElements();
28932   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
28933   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
28934   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
28935   if (LoadFirstElt && LoadLastElt) {
28936     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28937                                 ML->getMemOperand());
28938     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
28939     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
28940   }
28941 
28942   // Convert a masked load with a constant mask into a masked load and a select.
28943   // This allows the select operation to use a faster kind of select instruction
28944   // (for example, vblendvps -> vblendps).
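  // For example, a v8f32 masked load with a constant mask and a non-undef
  // pass-through becomes a masked load with an undef pass-through plus a
  // vselect on the same constant mask; the vselect can then lower to an
  // immediate blend instead of a variable blend.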
28945 
28946   // Don't try this if the pass-through operand is already undefined. That would
28947   // cause an infinite loop because that's what we're about to create.
28948   if (ML->getSrc0().isUndef())
28949     return SDValue();
28950 
28951   // The new masked load has an undef pass-through operand. The select uses the
28952   // original pass-through operand.
28953   SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28954                                     ML->getMask(), DAG.getUNDEF(VT),
28955                                     ML->getMemoryVT(), ML->getMemOperand(),
28956                                     ML->getExtensionType());
28957   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
28958 
28959   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
28960 }
28961 
28962 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
28963                                  TargetLowering::DAGCombinerInfo &DCI,
28964                                  const X86Subtarget &Subtarget) {
28965   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
28966   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
28967     if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
28968       return ScalarLoad;
28969     // TODO: Do some AVX512 subsets benefit from this transform?
28970     if (!Subtarget.hasAVX512())
28971       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
28972         return Blend;
28973   }
28974 
28975   if (Mld->getExtensionType() != ISD::SEXTLOAD)
28976     return SDValue();
28977 
28978   // Resolve extending loads.
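  // For example, a sign-extending masked load from <8 x i16> memory to an
  // <8 x i32> result is rewritten as a <16 x i16> non-extending masked load
  // (with the pass-through and mask widened accordingly) followed by an
  // X86ISD::VSEXT back to <8 x i32>.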
28979   EVT VT = Mld->getValueType(0);
28980   unsigned NumElems = VT.getVectorNumElements();
28981   EVT LdVT = Mld->getMemoryVT();
28982   SDLoc dl(Mld);
28983 
28984   assert(LdVT != VT && "Cannot extend to the same type");
28985   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
28986   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
28987   // From/To sizes and ElemCount must be pow of two.
28988   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
28989     "Unexpected size for extending masked load");
28990 
28991   unsigned SizeRatio  = ToSz / FromSz;
28992   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
28993 
28994   // Create a type on which we perform the shuffle.
28995   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
28996           LdVT.getScalarType(), NumElems*SizeRatio);
28997   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
28998 
28999   // Convert Src0 value.
29000   SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
29001   if (!Mld->getSrc0().isUndef()) {
29002     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29003     for (unsigned i = 0; i != NumElems; ++i)
29004       ShuffleVec[i] = i * SizeRatio;
29005 
29006     // Can't shuffle using an illegal type.
29007     assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29008            "WideVecVT should be legal");
29009     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
29010                                     DAG.getUNDEF(WideVecVT), ShuffleVec);
29011   }
29012   // Prepare the new mask.
29013   SDValue NewMask;
29014   SDValue Mask = Mld->getMask();
29015   if (Mask.getValueType() == VT) {
29016     // Mask and original value have the same type.
29017     NewMask = DAG.getBitcast(WideVecVT, Mask);
29018     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29019     for (unsigned i = 0; i != NumElems; ++i)
29020       ShuffleVec[i] = i * SizeRatio;
29021     for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
29022       ShuffleVec[i] = NumElems * SizeRatio;
29023     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29024                                    DAG.getConstant(0, dl, WideVecVT),
29025                                    ShuffleVec);
29026   } else {
29027     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29028     unsigned WidenNumElts = NumElems*SizeRatio;
29029     unsigned MaskNumElts = VT.getVectorNumElements();
29030     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
29031                                      WidenNumElts);
29032 
29033     unsigned NumConcat = WidenNumElts / MaskNumElts;
29034     SmallVector<SDValue, 16> Ops(NumConcat);
29035     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29036     Ops[0] = Mask;
29037     for (unsigned i = 1; i != NumConcat; ++i)
29038       Ops[i] = ZeroVal;
29039 
29040     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29041   }
29042 
29043   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
29044                                      Mld->getBasePtr(), NewMask, WideSrc0,
29045                                      Mld->getMemoryVT(), Mld->getMemOperand(),
29046                                      ISD::NON_EXTLOAD);
29047   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
29048   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
29049 }
29050 
29051 /// If exactly one element of the mask is set for a non-truncating masked store,
29052 /// replace it with a vector extract and a scalar store.
29053 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
29054 /// mask have already been optimized in IR, so we don't bother with those here.
29055 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
29056                                               SelectionDAG &DAG) {
29057   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
29058   // However, some target hooks may need to be added to know when the transform
29059   // is profitable. Endianness would also have to be considered.
29060 
29061   SDValue Addr, VecIndex;
29062   unsigned Alignment;
29063   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
29064     return SDValue();
29065 
29066   // Extract the one scalar element that is actually being stored.
29067   SDLoc DL(MS);
29068   EVT VT = MS->getValue().getValueType();
29069   EVT EltVT = VT.getVectorElementType();
29070   SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
29071                                 MS->getValue(), VecIndex);
29072 
29073   // Store that element at the appropriate offset from the base pointer.
29074   return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
29075                       MS->isVolatile(), MS->isNonTemporal(), Alignment);
29076 }
29077 
29078 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
29079                                   const X86Subtarget &Subtarget) {
29080   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
29081   if (!Mst->isTruncatingStore())
29082     return reduceMaskedStoreToScalarStore(Mst, DAG);
29083 
29084   // Resolve truncating stores.
29085   EVT VT = Mst->getValue().getValueType();
29086   unsigned NumElems = VT.getVectorNumElements();
29087   EVT StVT = Mst->getMemoryVT();
29088   SDLoc dl(Mst);
29089 
29090   assert(StVT != VT && "Cannot truncate to the same type");
29091   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29092   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29093 
29094   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29095 
29096   // The truncating store is legal in some cases. For example, on AVX-512 the
29097   // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw instructions perform
29098   // truncating stores directly.
29099   // In that case we don't need any further transformations.
29100   if (TLI.isTruncStoreLegal(VT, StVT))
29101     return SDValue();
29102 
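  // Otherwise, for example, a masked truncating store of <8 x i32> to
  // <8 x i16> memory is rewritten by bitcasting the value to <16 x i16>,
  // shuffling the low halves into the first eight lanes, widening the mask,
  // and emitting a masked store of the <8 x i16> memory type that is no
  // longer marked as truncating.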
29103   // From/To sizes and ElemCount must be pow of two.
29104   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
29105     "Unexpected size for truncating masked store");
29106   // We are going to use the original vector elt for storing.
29107   // Accumulated smaller vector elements must be a multiple of the store size.
29108   assert (((NumElems * FromSz) % ToSz) == 0 &&
29109           "Unexpected ratio for truncating masked store");
29110 
29111   unsigned SizeRatio  = FromSz / ToSz;
29112   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29113 
29114   // Create a type on which we perform the shuffle.
29115   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29116           StVT.getScalarType(), NumElems*SizeRatio);
29117 
29118   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29119 
29120   SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
29121   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29122   for (unsigned i = 0; i != NumElems; ++i)
29123     ShuffleVec[i] = i * SizeRatio;
29124 
29125   // Can't shuffle using an illegal type.
29126   assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29127          "WideVecVT should be legal");
29128 
29129   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29130                                               DAG.getUNDEF(WideVecVT),
29131                                               ShuffleVec);
29132 
29133   SDValue NewMask;
29134   SDValue Mask = Mst->getMask();
29135   if (Mask.getValueType() == VT) {
29136     // Mask and original value have the same type.
29137     NewMask = DAG.getBitcast(WideVecVT, Mask);
29138     for (unsigned i = 0; i != NumElems; ++i)
29139       ShuffleVec[i] = i * SizeRatio;
29140     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
29141       ShuffleVec[i] = NumElems*SizeRatio;
29142     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29143                                    DAG.getConstant(0, dl, WideVecVT),
29144                                    ShuffleVec);
29145   } else {
29146     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29147     unsigned WidenNumElts = NumElems*SizeRatio;
29148     unsigned MaskNumElts = VT.getVectorNumElements();
29149     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
29150                                      WidenNumElts);
29151 
29152     unsigned NumConcat = WidenNumElts / MaskNumElts;
29153     SmallVector<SDValue, 16> Ops(NumConcat);
29154     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29155     Ops[0] = Mask;
29156     for (unsigned i = 1; i != NumConcat; ++i)
29157       Ops[i] = ZeroVal;
29158 
29159     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29160   }
29161 
29162   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
29163                             Mst->getBasePtr(), NewMask, StVT,
29164                             Mst->getMemOperand(), false);
29165 }
29166 
29167 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
29168                             const X86Subtarget &Subtarget) {
29169   StoreSDNode *St = cast<StoreSDNode>(N);
29170   EVT VT = St->getValue().getValueType();
29171   EVT StVT = St->getMemoryVT();
29172   SDLoc dl(St);
29173   SDValue StoredVal = St->getOperand(1);
29174   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29175 
29176   // If we are saving a concatenation of two XMM registers and 32-byte stores
29177   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
29178   bool Fast;
29179   unsigned AddressSpace = St->getAddressSpace();
29180   unsigned Alignment = St->getAlignment();
29181   if (VT.is256BitVector() && StVT == VT &&
29182       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
29183                              AddressSpace, Alignment, &Fast) &&
29184       !Fast) {
29185     unsigned NumElems = VT.getVectorNumElements();
29186     if (NumElems < 2)
29187       return SDValue();
29188 
29189     SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
29190     SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
29191 
29192     SDValue Ptr0 = St->getBasePtr();
29193     SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
29194 
29195     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
29196                                St->getPointerInfo(), St->isVolatile(),
29197                                St->isNonTemporal(), Alignment);
29198     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
29199                                St->getPointerInfo(), St->isVolatile(),
29200                                St->isNonTemporal(),
29201                                std::min(16U, Alignment));
29202     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
29203   }
29204 
29205   // Optimize trunc store (of multiple scalars) to shuffle and store.
29206   // First, pack all of the elements in one place. Next, store to memory
29207   // in fewer chunks.
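    // For example, a truncating store of <8 x i32> to <8 x i16> memory (when
    // no legal truncating store is available) is rewritten by bitcasting to
    // <16 x i16>, shuffling the low halves into the first eight lanes, and
    // then storing the low 128 bits with one or more plain integer/f64 stores.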
29208   if (St->isTruncatingStore() && VT.isVector()) {
29209     // Check if we can detect an AVG pattern from the truncation. If yes,
29210     // replace the trunc store by a normal store with the result of X86ISD::AVG
29211     // instruction.
29212     if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
29213                                        Subtarget, dl))
29214       return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
29215                           St->getPointerInfo(), St->isVolatile(),
29216                           St->isNonTemporal(), St->getAlignment());
29217 
29218     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29219     unsigned NumElems = VT.getVectorNumElements();
29220     assert(StVT != VT && "Cannot truncate to the same type");
29221     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29222     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29223 
29224     // The truncating store is legal in some cases. For example, on AVX-512 the
29225     // vpmovqb, vpmovqw, vpmovqd, vpmovdb, and vpmovdw instructions perform
29226     // truncating stores directly.
29227     // In that case we don't need any further transformations.
29228     if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
29229       return SDValue();
29230 
29231     // From, To sizes and ElemCount must be pow of two
29232     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
29233     // We are going to use the original vector elt for storing.
29234     // Accumulated smaller vector elements must be a multiple of the store size.
29235     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
29236 
29237     unsigned SizeRatio  = FromSz / ToSz;
29238 
29239     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29240 
29241     // Create a type on which we perform the shuffle
29242     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29243             StVT.getScalarType(), NumElems*SizeRatio);
29244 
29245     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29246 
29247     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
29248     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
29249     for (unsigned i = 0; i != NumElems; ++i)
29250       ShuffleVec[i] = i * SizeRatio;
29251 
29252     // Can't shuffle using an illegal type.
29253     if (!TLI.isTypeLegal(WideVecVT))
29254       return SDValue();
29255 
29256     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29257                                          DAG.getUNDEF(WideVecVT),
29258                                          ShuffleVec);
29259     // At this point all of the data is stored at the bottom of the
29260     // register. We now need to save it to mem.
29261 
29262     // Find the largest store unit
29263     MVT StoreType = MVT::i8;
29264     for (MVT Tp : MVT::integer_valuetypes()) {
29265       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
29266         StoreType = Tp;
29267     }
29268 
29269     // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
29270     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
29271         (64 <= NumElems * ToSz))
29272       StoreType = MVT::f64;
29273 
29274     // Bitcast the original vector into a vector of store-size units
29275     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
29276             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
29277     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
29278     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
29279     SmallVector<SDValue, 8> Chains;
29280     SDValue Ptr = St->getBasePtr();
29281 
29282     // Perform one or more big stores into memory.
29283     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
29284       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
29285                                    StoreType, ShuffWide,
29286                                    DAG.getIntPtrConstant(i, dl));
29287       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
29288                                 St->getPointerInfo(), St->isVolatile(),
29289                                 St->isNonTemporal(), St->getAlignment());
29290       Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
29291       Chains.push_back(Ch);
29292     }
29293 
29294     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
29295   }
29296 
29297   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
29298   // the FP state in cases where an emms may be missing.
29299   // A preferable solution to the general problem is to figure out the right
29300   // places to insert EMMS.  This qualifies as a quick hack.
29301 
29302   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
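  // For example, on a 32-bit target where f64 SSE operations are legal,
  // (store i64 (load i64 p)) becomes an f64 load/store pair instead of two
  // pairs of 32-bit integer loads/stores.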
29303   if (VT.getSizeInBits() != 64)
29304     return SDValue();
29305 
29306   const Function *F = DAG.getMachineFunction().getFunction();
29307   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
29308   bool F64IsLegal =
29309       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
29310   if ((VT.isVector() ||
29311        (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
29312       isa<LoadSDNode>(St->getValue()) &&
29313       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
29314       St->getChain().hasOneUse() && !St->isVolatile()) {
29315     SDNode* LdVal = St->getValue().getNode();
29316     LoadSDNode *Ld = nullptr;
29317     int TokenFactorIndex = -1;
29318     SmallVector<SDValue, 8> Ops;
29319     SDNode* ChainVal = St->getChain().getNode();
29320     // Must be a store of a load.  We currently handle two cases:  the load
29321     // is a direct child, or it's under an intervening TokenFactor.  It is
29322     // possible to dig deeper under nested TokenFactors.
29323     if (ChainVal == LdVal)
29324       Ld = cast<LoadSDNode>(St->getChain());
29325     else if (St->getValue().hasOneUse() &&
29326              ChainVal->getOpcode() == ISD::TokenFactor) {
29327       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
29328         if (ChainVal->getOperand(i).getNode() == LdVal) {
29329           TokenFactorIndex = i;
29330           Ld = cast<LoadSDNode>(St->getValue());
29331         } else
29332           Ops.push_back(ChainVal->getOperand(i));
29333       }
29334     }
29335 
29336     if (!Ld || !ISD::isNormalLoad(Ld))
29337       return SDValue();
29338 
29339     // If this is not the MMX case, i.e. we are just turning i64 load/store
29340     // into f64 load/store, avoid the transformation if there are multiple
29341     // uses of the loaded value.
29342     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
29343       return SDValue();
29344 
29345     SDLoc LdDL(Ld);
29346     SDLoc StDL(N);
29347     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
29348     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
29349     // pair instead.
29350     if (Subtarget.is64Bit() || F64IsLegal) {
29351       MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
29352       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
29353                                   Ld->getPointerInfo(), Ld->isVolatile(),
29354                                   Ld->isNonTemporal(), Ld->isInvariant(),
29355                                   Ld->getAlignment());
29356       SDValue NewChain = NewLd.getValue(1);
29357       if (TokenFactorIndex >= 0) {
29358         Ops.push_back(NewChain);
29359         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29360       }
29361       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
29362                           St->getPointerInfo(),
29363                           St->isVolatile(), St->isNonTemporal(),
29364                           St->getAlignment());
29365     }
29366 
29367     // Otherwise, lower to two pairs of 32-bit loads / stores.
29368     SDValue LoAddr = Ld->getBasePtr();
29369     SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
29370 
29371     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
29372                                Ld->getPointerInfo(),
29373                                Ld->isVolatile(), Ld->isNonTemporal(),
29374                                Ld->isInvariant(), Ld->getAlignment());
29375     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
29376                                Ld->getPointerInfo().getWithOffset(4),
29377                                Ld->isVolatile(), Ld->isNonTemporal(),
29378                                Ld->isInvariant(),
29379                                MinAlign(Ld->getAlignment(), 4));
29380 
29381     SDValue NewChain = LoLd.getValue(1);
29382     if (TokenFactorIndex >= 0) {
29383       Ops.push_back(LoLd);
29384       Ops.push_back(HiLd);
29385       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29386     }
29387 
29388     LoAddr = St->getBasePtr();
29389     HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
29390 
29391     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
29392                                 St->getPointerInfo(),
29393                                 St->isVolatile(), St->isNonTemporal(),
29394                                 St->getAlignment());
29395     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
29396                                 St->getPointerInfo().getWithOffset(4),
29397                                 St->isVolatile(),
29398                                 St->isNonTemporal(),
29399                                 MinAlign(St->getAlignment(), 4));
29400     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
29401   }
29402 
29403   // This is similar to the above case, but here we handle a scalar 64-bit
29404   // integer store that is extracted from a vector on a 32-bit target.
29405   // If we have SSE2, then we can treat it like a floating-point double
29406   // to get past legalization. The execution dependencies fixup pass will
29407   // choose the optimal machine instruction for the store if this really is
29408   // an integer or v2f32 rather than an f64.
29409   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
29410       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
29411     SDValue OldExtract = St->getOperand(1);
29412     SDValue ExtOp0 = OldExtract.getOperand(0);
29413     unsigned VecSize = ExtOp0.getValueSizeInBits();
29414     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
29415     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
29416     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
29417                                      BitCast, OldExtract.getOperand(1));
29418     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
29419                         St->getPointerInfo(), St->isVolatile(),
29420                         St->isNonTemporal(), St->getAlignment());
29421   }
29422 
29423   return SDValue();
29424 }
29425 
29426 /// Return 'true' if this vector operation is "horizontal"
29427 /// and return the operands for the horizontal operation in LHS and RHS.  A
29428 /// horizontal operation performs the binary operation on successive elements
29429 /// of its first operand, then on successive elements of its second operand,
29430 /// returning the resulting values in a vector.  For example, if
29431 ///   A = < float a0, float a1, float a2, float a3 >
29432 /// and
29433 ///   B = < float b0, float b1, float b2, float b3 >
29434 /// then the result of doing a horizontal operation on A and B is
29435 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
29436 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
29437 /// A horizontal-op B, for some already available A and B, and if so then LHS is
29438 /// set to A, RHS to B, and the routine returns 'true'.
29439 /// Note that the binary operation should have the property that if one of the
29440 /// operands is UNDEF then the result is UNDEF.
29441 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
29442   // Look for the following pattern: if
29443   //   A = < float a0, float a1, float a2, float a3 >
29444   //   B = < float b0, float b1, float b2, float b3 >
29445   // and
29446   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
29447   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
29448   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
29449   // which is A horizontal-op B.
29450 
29451   // At least one of the operands should be a vector shuffle.
29452   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
29453       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
29454     return false;
29455 
29456   MVT VT = LHS.getSimpleValueType();
29457 
29458   assert((VT.is128BitVector() || VT.is256BitVector()) &&
29459          "Unsupported vector type for horizontal add/sub");
29460 
29461   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
29462   // operate independently on 128-bit lanes.
29463   unsigned NumElts = VT.getVectorNumElements();
29464   unsigned NumLanes = VT.getSizeInBits()/128;
29465   unsigned NumLaneElts = NumElts / NumLanes;
29466   assert((NumLaneElts % 2 == 0) &&
29467          "Vector type should have an even number of elements in each lane");
29468   unsigned HalfLaneElts = NumLaneElts/2;
29469 
29470   // View LHS in the form
29471   //   LHS = VECTOR_SHUFFLE A, B, LMask
29472   // If LHS is not a shuffle then pretend it is the shuffle
29473   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
29474   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
29475   // type VT.
29476   SDValue A, B;
29477   SmallVector<int, 16> LMask(NumElts);
29478   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29479     if (!LHS.getOperand(0).isUndef())
29480       A = LHS.getOperand(0);
29481     if (!LHS.getOperand(1).isUndef())
29482       B = LHS.getOperand(1);
29483     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
29484     std::copy(Mask.begin(), Mask.end(), LMask.begin());
29485   } else {
29486     if (!LHS.isUndef())
29487       A = LHS;
29488     for (unsigned i = 0; i != NumElts; ++i)
29489       LMask[i] = i;
29490   }
29491 
29492   // Likewise, view RHS in the form
29493   //   RHS = VECTOR_SHUFFLE C, D, RMask
29494   SDValue C, D;
29495   SmallVector<int, 16> RMask(NumElts);
29496   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29497     if (!RHS.getOperand(0).isUndef())
29498       C = RHS.getOperand(0);
29499     if (!RHS.getOperand(1).isUndef())
29500       D = RHS.getOperand(1);
29501     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
29502     std::copy(Mask.begin(), Mask.end(), RMask.begin());
29503   } else {
29504     if (!RHS.isUndef())
29505       C = RHS;
29506     for (unsigned i = 0; i != NumElts; ++i)
29507       RMask[i] = i;
29508   }
29509 
29510   // Check that the shuffles are both shuffling the same vectors.
29511   if (!(A == C && B == D) && !(A == D && B == C))
29512     return false;
29513 
29514   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
29515   if (!A.getNode() && !B.getNode())
29516     return false;
29517 
29518   // If A and B occur in reverse order in RHS, then "swap" them (which means
29519   // rewriting the mask).
29520   if (A != C)
29521     ShuffleVectorSDNode::commuteMask(RMask);
29522 
29523   // At this point LHS and RHS are equivalent to
29524   //   LHS = VECTOR_SHUFFLE A, B, LMask
29525   //   RHS = VECTOR_SHUFFLE A, B, RMask
29526   // Check that the masks correspond to performing a horizontal operation.
29527   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
29528     for (unsigned i = 0; i != NumLaneElts; ++i) {
29529       int LIdx = LMask[i+l], RIdx = RMask[i+l];
29530 
29531       // Ignore any UNDEF components.
29532       if (LIdx < 0 || RIdx < 0 ||
29533           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
29534           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
29535         continue;
29536 
29537       // Check that successive elements are being operated on.  If not, this is
29538       // not a horizontal operation.
29539       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
29540       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
29541       if (!(LIdx == Index && RIdx == Index + 1) &&
29542           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
29543         return false;
29544     }
29545   }
29546 
29547   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
29548   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
29549   return true;
29550 }
29551 
29552 /// Do target-specific dag combines on floating-point adds/subs.
29553 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
29554                                const X86Subtarget &Subtarget) {
29555   EVT VT = N->getValueType(0);
29556   SDValue LHS = N->getOperand(0);
29557   SDValue RHS = N->getOperand(1);
29558   bool IsFadd = N->getOpcode() == ISD::FADD;
29559   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
29560 
29561   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
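  // For example, for v4f32:
  //   (fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>))
  // becomes (X86ISD::FHADD A, B), which can be selected as haddps.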
29562   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
29563        (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
29564       isHorizontalBinOp(LHS, RHS, IsFadd)) {
29565     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
29566     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
29567   }
29568   return SDValue();
29569 }
29570 
29571 /// Truncate a group of v4i32/v2i64 vectors into v16i8/v8i16 using X86ISD::PACKUS.
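/// For example, in the i32 -> i8 case each input register is first masked so
/// that only the bits surviving the truncation remain, and then pairs of
/// registers are repeatedly packed with PACKUS, halving the element width each
/// time (32 -> 16 -> 8), until the elements reach the output width.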
29572 static SDValue
29573 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
29574                                   SmallVector<SDValue, 8> &Regs) {
29575   assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
29576                              Regs[0].getValueType() == MVT::v2i64));
29577   EVT OutVT = N->getValueType(0);
29578   EVT OutSVT = OutVT.getVectorElementType();
29579   EVT InVT = Regs[0].getValueType();
29580   EVT InSVT = InVT.getVectorElementType();
29581   SDLoc DL(N);
29582 
29583   // First, use mask to unset all bits that won't appear in the result.
29584   assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
29585          "OutSVT can only be either i8 or i16.");
29586   APInt Mask =
29587       APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
29588   SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
29589   for (auto &Reg : Regs)
29590     Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
29591 
29592   MVT UnpackedVT, PackedVT;
29593   if (OutSVT == MVT::i8) {
29594     UnpackedVT = MVT::v8i16;
29595     PackedVT = MVT::v16i8;
29596   } else {
29597     UnpackedVT = MVT::v4i32;
29598     PackedVT = MVT::v8i16;
29599   }
29600 
29601   // In each iteration, halve the element size by packing pairs of registers.
29602   auto RegNum = Regs.size();
29603   for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
29604        j < e; j *= 2, RegNum /= 2) {
29605     for (unsigned i = 0; i < RegNum; i++)
29606       Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
29607     for (unsigned i = 0; i < RegNum / 2; i++)
29608       Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
29609                             Regs[i * 2 + 1]);
29610   }
29611 
29612   // If the result type is v8i8, we need to do one more X86ISD::PACKUS and
29613   // then extract a subvector as the result, since v8i8 is not a legal type.
29614   if (OutVT == MVT::v8i8) {
29615     Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
29616     Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
29617                           DAG.getIntPtrConstant(0, DL));
29618     return Regs[0];
29619   } else if (RegNum > 1) {
29620     Regs.resize(RegNum);
29621     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29622   } else
29623     return Regs[0];
29624 }
29625 
29626 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
29627 static SDValue
29628 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
29629                                   SmallVector<SDValue, 8> &Regs) {
29630   assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
29631   EVT OutVT = N->getValueType(0);
29632   SDLoc DL(N);
29633 
29634   // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
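  // This sign-extends bit 15 of each i32 element across its upper half, so the
  // signed saturation performed by the PACKSS below reproduces exactly the low
  // 16 bits of the original element.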
29635   SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
29636   for (auto &Reg : Regs) {
29637     Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29638     Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29639   }
29640 
29641   for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
29642     Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
29643                           Regs[i * 2 + 1]);
29644 
29645   if (Regs.size() > 2) {
29646     Regs.resize(Regs.size() / 2);
29647     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29648   } else
29649     return Regs[0];
29650 }
29651 
29652 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
29653 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
29654 /// legalization the truncation will be translated into a BUILD_VECTOR with each
29655 /// element extracted from a vector and then truncated, and it is
29656 /// difficult to do this optimization on that form.
29657 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
29658                                        const X86Subtarget &Subtarget) {
29659   EVT OutVT = N->getValueType(0);
29660   if (!OutVT.isVector())
29661     return SDValue();
29662 
29663   SDValue In = N->getOperand(0);
29664   if (!In.getValueType().isSimple())
29665     return SDValue();
29666 
29667   EVT InVT = In.getValueType();
29668   unsigned NumElems = OutVT.getVectorNumElements();
29669 
29670   // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
29671   // SSE2, and we need to take care of it specially.
29672   // AVX512 provides vpmovdb.
29673   if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
29674     return SDValue();
29675 
29676   EVT OutSVT = OutVT.getVectorElementType();
29677   EVT InSVT = InVT.getVectorElementType();
29678   if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
29679         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
29680         NumElems >= 8))
29681     return SDValue();
29682 
29683   // SSSE3's pshufb results in fewer instructions in the cases below.
29684   if (Subtarget.hasSSSE3() && NumElems == 8 &&
29685       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
29686        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
29687     return SDValue();
29688 
29689   SDLoc DL(N);
29690 
29691   // Split a long vector into vectors of legal type.
29692   unsigned RegNum = InVT.getSizeInBits() / 128;
29693   SmallVector<SDValue, 8> SubVec(RegNum);
29694   unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
29695   EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
29696 
29697   for (unsigned i = 0; i < RegNum; i++)
29698     SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
29699                             DAG.getIntPtrConstant(i * NumSubRegElts, DL));
29700 
29701   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
29702   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
29703   // truncate 2 x v4i32 to v8i16.
29704   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
29705     return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
29706   else if (InSVT == MVT::i32)
29707     return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
29708   else
29709     return SDValue();
29710 }
29711 
29712 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
29713                                const X86Subtarget &Subtarget) {
29714   EVT VT = N->getValueType(0);
29715   SDValue Src = N->getOperand(0);
29716   SDLoc DL(N);
29717 
29718   // Try to detect AVG pattern first.
29719   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
29720     return Avg;
29721 
29722   // Detect a truncation to i32 of a bitcast whose source is a direct MMX
29723   // (x86mmx) result.
29724   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
29725     SDValue BCSrc = Src.getOperand(0);
29726     if (BCSrc.getValueType() == MVT::x86mmx)
29727       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
29728   }
29729 
29730   return combineVectorTruncation(N, DAG, Subtarget);
29731 }
29732 
29733 /// Do target-specific dag combines on floating point negations.
29734 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
29735                            const X86Subtarget &Subtarget) {
29736   EVT VT = N->getValueType(0);
29737   EVT SVT = VT.getScalarType();
29738   SDValue Arg = N->getOperand(0);
29739   SDLoc DL(N);
29740 
29741   // Let legalize expand this if it isn't a legal type yet.
29742   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29743     return SDValue();
29744 
29745   // If we're negating a FMUL node on a target with FMA, then we can avoid the
29746   // use of a constant by performing (-0 - A*B) instead.
29747   // FIXME: Check rounding control flags as well once it becomes available.
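  // That is, (fneg (fmul A, B)) becomes FNMSUB(A, B, 0.0) = -(A*B) - 0.0,
  // which equals -(A*B) because signed zeros may be ignored here.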
29748   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
29749       Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
29750     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
29751     return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29752                        Arg.getOperand(1), Zero);
29753   }
29754 
29755   // If we're negating a FMA node, then we can adjust the
29756   // instruction to include the extra negation.
29757   if (Arg.hasOneUse()) {
29758     switch (Arg.getOpcode()) {
29759     case X86ISD::FMADD:
29760       return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29761                          Arg.getOperand(1), Arg.getOperand(2));
29762     case X86ISD::FMSUB:
29763       return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
29764                          Arg.getOperand(1), Arg.getOperand(2));
29765     case X86ISD::FNMADD:
29766       return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
29767                          Arg.getOperand(1), Arg.getOperand(2));
29768     case X86ISD::FNMSUB:
29769       return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
29770                          Arg.getOperand(1), Arg.getOperand(2));
29771     }
29772   }
29773   return SDValue();
29774 }
29775 
29776 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
29777                               const X86Subtarget &Subtarget) {
29778   EVT VT = N->getValueType(0);
29779   if (VT.is512BitVector() && !Subtarget.hasDQI()) {
29780     // VXORPS, VORPS, VANDPS, VANDNPS are supported only under the DQ extension.
29781     // These logic operations may be executed in the integer domain.
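    // For example, (v16f32 FXOR a, b) is rewritten as
    // bitcast(v16i32 (xor (bitcast a), (bitcast b))).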
29782     SDLoc dl(N);
29783     MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
29784     MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
29785 
29786     SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
29787     SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
29788     unsigned IntOpcode = 0;
29789     switch (N->getOpcode()) {
29790       default: llvm_unreachable("Unexpected FP logic op");
29791       case X86ISD::FOR: IntOpcode = ISD::OR; break;
29792       case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
29793       case X86ISD::FAND: IntOpcode = ISD::AND; break;
29794       case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
29795     }
29796     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
29797     return DAG.getBitcast(VT, IntOp);
29798   }
29799   return SDValue();
29800 }
29801 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
29802 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
29803                           const X86Subtarget &Subtarget) {
29804   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
29805 
29806   // F[X]OR(0.0, x) -> x
29807   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29808     if (C->getValueAPF().isPosZero())
29809       return N->getOperand(1);
29810 
29811   // F[X]OR(x, 0.0) -> x
29812   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29813     if (C->getValueAPF().isPosZero())
29814       return N->getOperand(0);
29815 
29816   return lowerX86FPLogicOp(N, DAG, Subtarget);
29817 }
29818 
29819 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
29820 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
29821   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
29822 
29823   // Only perform optimizations if UnsafeMath is used.
29824   if (!DAG.getTarget().Options.UnsafeFPMath)
29825     return SDValue();
29826 
29827   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
29828   // into FMAXC and FMINC, which are commutative operations.
29829   unsigned NewOp = 0;
29830   switch (N->getOpcode()) {
29831     default: llvm_unreachable("unknown opcode");
29832     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
29833     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
29834   }
29835 
29836   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
29837                      N->getOperand(0), N->getOperand(1));
29838 }
29839 
29840 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
29841                                      const X86Subtarget &Subtarget) {
29842   if (Subtarget.useSoftFloat())
29843     return SDValue();
29844 
29845   // TODO: Check for global or instruction-level "nnan". In that case, we
29846   //       should be able to lower to FMAX/FMIN alone.
29847   // TODO: If an operand is already known to be a NaN or not a NaN, this
29848   //       should be an optional swap and FMAX/FMIN.
29849 
29850   EVT VT = N->getValueType(0);
29851   if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
29852         (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
29853         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
29854     return SDValue();
29855 
29856   // This takes at least 3 instructions, so favor a library call when operating
29857   // on a scalar and minimizing code size.
29858   if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
29859     return SDValue();
29860 
29861   SDValue Op0 = N->getOperand(0);
29862   SDValue Op1 = N->getOperand(1);
29863   SDLoc DL(N);
29864   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
29865       DAG.getDataLayout(), *DAG.getContext(), VT);
29866 
29867   // There are 4 possibilities involving NaN inputs, and these are the required
29868   // outputs:
29869   //                   Op1
29870   //               Num     NaN
29871   //            ----------------
29872   //       Num  |  Max  |  Op0 |
29873   // Op0        ----------------
29874   //       NaN  |  Op1  |  NaN |
29875   //            ----------------
29876   //
29877   // The SSE FP max/min instructions were not designed for this case, but rather
29878   // to implement:
29879   //   Min = Op1 < Op0 ? Op1 : Op0
29880   //   Max = Op1 > Op0 ? Op1 : Op0
29881   //
29882   // So they always return Op0 if either input is a NaN. However, we can still
29883   // use those instructions for fmaxnum by selecting away a NaN input.
29884 
29885   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
29886   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
29887   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
29888   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
29889 
29890   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
29891   // are NaN, the NaN value of Op1 is the result.
29892   auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
29893   return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
29894 }
29895 
29896 /// Do target-specific dag combines on X86ISD::FAND nodes.
29897 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
29898                            const X86Subtarget &Subtarget) {
29899   // FAND(0.0, x) -> 0.0
29900   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29901     if (C->getValueAPF().isPosZero())
29902       return N->getOperand(0);
29903 
29904   // FAND(x, 0.0) -> 0.0
29905   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29906     if (C->getValueAPF().isPosZero())
29907       return N->getOperand(1);
29908 
29909   return lowerX86FPLogicOp(N, DAG, Subtarget);
29910 }
29911 
29912 /// Do target-specific dag combines on X86ISD::FANDN nodes
29913 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
29914                             const X86Subtarget &Subtarget) {
29915   // FANDN(0.0, x) -> x
29916   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29917     if (C->getValueAPF().isPosZero())
29918       return N->getOperand(1);
29919 
29920   // FANDN(x, 0.0) -> 0.0
29921   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29922     if (C->getValueAPF().isPosZero())
29923       return N->getOperand(1);
29924 
29925   return lowerX86FPLogicOp(N, DAG, Subtarget);
29926 }
29927 
29928 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
29929                          TargetLowering::DAGCombinerInfo &DCI) {
29930   // BT ignores high bits in the bit index operand.
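  // (The register form of BT interprets the bit index modulo the operand
  // width, which is why only the low Log2_32(BitWidth) bits are demanded.)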
29931   SDValue Op1 = N->getOperand(1);
29932   if (Op1.hasOneUse()) {
29933     unsigned BitWidth = Op1.getValueSizeInBits();
29934     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
29935     APInt KnownZero, KnownOne;
29936     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
29937                                           !DCI.isBeforeLegalizeOps());
29938     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29939     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
29940         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
29941       DCI.CommitTargetLoweringOpt(TLO);
29942   }
29943   return SDValue();
29944 }
29945 
29946 static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
29947   SDValue Op = peekThroughBitcasts(N->getOperand(0));
29948   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
29949   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
29950       VT.getVectorElementType().getSizeInBits() ==
29951       OpVT.getVectorElementType().getSizeInBits()) {
29952     return DAG.getBitcast(VT, Op);
29953   }
29954   return SDValue();
29955 }
29956 
29957 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
29958                                       const X86Subtarget &Subtarget) {
29959   EVT VT = N->getValueType(0);
29960   if (!VT.isVector())
29961     return SDValue();
29962 
29963   SDValue N0 = N->getOperand(0);
29964   SDValue N1 = N->getOperand(1);
29965   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
29966   SDLoc dl(N);
29967 
29968   // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
29969   // AVX2 since there is no sign-extended shift right operation on a vector
29970   // with 64-bit elements.
29971   // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
29972   //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
29973   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
29974       N0.getOpcode() == ISD::SIGN_EXTEND)) {
29975     SDValue N00 = N0.getOperand(0);
29976 
29977     // EXTLOAD has a better solution on AVX2,
29978     // it may be replaced with X86ISD::VSEXT node.
29979     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
29980       if (!ISD::isNormalLoad(N00.getNode()))
29981         return SDValue();
29982 
29983     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
29984       SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
29985                                 N00, N1);
29986       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
29987     }
29988   }
29989   return SDValue();
29990 }
29991 
29992 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
29993 /// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
29994 /// to combine math ops, use an LEA, or use a complex addressing mode. This can
29995 /// eliminate extend, add, and shift instructions.
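///
/// For example (rough sketch):
///   (i64 (sext (i32 add nsw %x, 42)))
///     --> (i64 add nsw (i64 (sext %x)), 42)
/// where the now-64-bit constant can become the displacement of an LEA or fold
/// into a user's addressing mode.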
29996 static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
29997                                        const X86Subtarget &Subtarget) {
29998   // TODO: This should be valid for other integer types.
29999   EVT VT = Sext->getValueType(0);
30000   if (VT != MVT::i64)
30001     return SDValue();
30002 
30003   // We need an 'add nsw' feeding into the 'sext'.
30004   SDValue Add = Sext->getOperand(0);
30005   if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
30006     return SDValue();
30007 
30008   // Having a constant operand to the 'add' ensures that we are not increasing
30009   // the instruction count because the constant is extended for free below.
30010   // A constant operand can also become the displacement field of an LEA.
30011   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
30012   if (!AddOp1)
30013     return SDValue();
30014 
30015   // Don't make the 'add' bigger if there's no hope of combining it with some
30016   // other 'add' or 'shl' instruction.
30017   // TODO: It may be profitable to generate simpler LEA instructions in place
30018   // of single 'add' instructions, but the cost model for selecting an LEA
30019   // currently has a high threshold.
30020   bool HasLEAPotential = false;
30021   for (auto *User : Sext->uses()) {
30022     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
30023       HasLEAPotential = true;
30024       break;
30025     }
30026   }
30027   if (!HasLEAPotential)
30028     return SDValue();
30029 
30030   // Everything looks good, so pull the 'sext' ahead of the 'add'.
30031   int64_t AddConstant = AddOp1->getSExtValue();
30032   SDValue AddOp0 = Add.getOperand(0);
30033   SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
30034   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
30035 
30036   // The wider add is guaranteed to not wrap because both operands are
30037   // sign-extended.
30038   SDNodeFlags Flags;
30039   Flags.setNoSignedWrap(true);
30040   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
30041 }
30042 
30043 /// (i8, i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
30044 /// (i8, i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
30045 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
30046 /// extends from AH (which we otherwise need to do contortions to access).
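///
/// For example (rough sketch):
///   (i32 (sext (i8 (sdivrem %x, %y):1)))
///     --> (i32 ((sdivrem8_sext_hreg %x, %y):1))
/// so instruction selection can read the sign-extended remainder directly out
/// of AH (8-bit idiv leaves the quotient in AL and the remainder in AH).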
30047 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
30048   SDValue N0 = N->getOperand(0);
30049   auto OpcodeN = N->getOpcode();
30050   auto OpcodeN0 = N0.getOpcode();
30051   if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
30052         (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
30053     return SDValue();
30054 
30055   EVT VT = N->getValueType(0);
30056   EVT InVT = N0.getValueType();
30057   if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
30058     return SDValue();
30059 
30060   SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
30061   auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
30062                                                : X86ISD::UDIVREM8_ZEXT_HREG;
30063   SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
30064                           N0.getOperand(1));
30065   DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
30066   return R.getValue(1);
30067 }
30068 
30069 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
30070 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
30071 /// with UNDEFs) the input into vectors of the same size as the target type,
30072 /// which then extends the lowest elements.
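///
/// For example, on an SSE4.1 target (rough sketch):
///   (v4i32 (zext (v4i16 %x)))
///     --> (zero_extend_vector_inreg (v8i16 concat %x, undef)), i.e. PMOVZXWD,
/// while pre-AVX2 targets split a 256-bit extend into two such 128-bit nodes
/// and concatenate the results.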
30073 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
30074                                           TargetLowering::DAGCombinerInfo &DCI,
30075                                           const X86Subtarget &Subtarget) {
30076   unsigned Opcode = N->getOpcode();
30077   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
30078     return SDValue();
30079   if (!DCI.isBeforeLegalizeOps())
30080     return SDValue();
30081   if (!Subtarget.hasSSE2())
30082     return SDValue();
30083 
30084   SDValue N0 = N->getOperand(0);
30085   EVT VT = N->getValueType(0);
30086   EVT SVT = VT.getScalarType();
30087   EVT InVT = N0.getValueType();
30088   EVT InSVT = InVT.getScalarType();
30089 
30090   // Input type must be a vector and we must be extending legal integer types.
30091   if (!VT.isVector())
30092     return SDValue();
30093   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
30094     return SDValue();
30095   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
30096     return SDValue();
30097 
30098   // On AVX2+ targets, if the input/output types are both legal then we will be
30099   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
30100   if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30101       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
30102     return SDValue();
30103 
30104   SDLoc DL(N);
30105 
30106   auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
30107     EVT InVT = N.getValueType();
30108     EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
30109                                  Size / InVT.getScalarSizeInBits());
30110     SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
30111                                   DAG.getUNDEF(InVT));
30112     Opnds[0] = N;
30113     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
30114   };
30115 
30116   // If the target size is less than 128 bits, widen the input to a type whose
30117   // extension is 128 bits, extend that, and extract the original target vector.
30118   if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
30119     unsigned Scale = 128 / VT.getSizeInBits();
30120     EVT ExVT =
30121         EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
30122     SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
30123     SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
30124     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
30125                        DAG.getIntPtrConstant(0, DL));
30126   }
30127 
30128   // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
30129   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
30130   // Also use this if we don't have SSE41, to allow the legalizer to do its job.
30131   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
30132       (VT.is256BitVector() && Subtarget.hasInt256())) {
30133     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
30134     return Opcode == ISD::SIGN_EXTEND
30135                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
30136                : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
30137   }
30138 
30139   // On pre-AVX2 targets, split into 128-bit nodes of
30140   // ISD::*_EXTEND_VECTOR_INREG.
30141   if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
30142     unsigned NumVecs = VT.getSizeInBits() / 128;
30143     unsigned NumSubElts = 128 / SVT.getSizeInBits();
30144     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
30145     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
30146 
30147     SmallVector<SDValue, 8> Opnds;
30148     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
30149       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
30150                                    DAG.getIntPtrConstant(Offset, DL));
30151       SrcVec = ExtendVecSize(DL, SrcVec, 128);
30152       SrcVec = Opcode == ISD::SIGN_EXTEND
30153                    ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
30154                    : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
30155       Opnds.push_back(SrcVec);
30156     }
30157     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
30158   }
30159 
30160   return SDValue();
30161 }
30162 
30163 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
30164                            TargetLowering::DAGCombinerInfo &DCI,
30165                            const X86Subtarget &Subtarget) {
30166   SDValue N0 = N->getOperand(0);
30167   EVT VT = N->getValueType(0);
30168   EVT InVT = N0.getValueType();
30169   SDLoc DL(N);
30170 
30171   if (SDValue DivRem8 = getDivRem8(N, DAG))
30172     return DivRem8;
30173 
30174   if (!DCI.isBeforeLegalizeOps()) {
30175     if (InVT == MVT::i1) {
30176       SDValue Zero = DAG.getConstant(0, DL, VT);
30177       SDValue AllOnes =
30178           DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
30179       return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
30180     }
30181     return SDValue();
30182   }
30183 
30184   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30185     return V;
30186 
30187   if (Subtarget.hasAVX() && VT.is256BitVector())
30188     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30189       return R;
30190 
30191   if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
30192     return NewAdd;
30193 
30194   return SDValue();
30195 }
30196 
30197 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
30198                           const X86Subtarget &Subtarget) {
30199   SDLoc dl(N);
30200   EVT VT = N->getValueType(0);
30201 
30202   // Let legalize expand this if it isn't a legal type yet.
30203   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30204     return SDValue();
30205 
30206   EVT ScalarVT = VT.getScalarType();
30207   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
30208     return SDValue();
30209 
30210   SDValue A = N->getOperand(0);
30211   SDValue B = N->getOperand(1);
30212   SDValue C = N->getOperand(2);
30213 
30214   bool NegA = (A.getOpcode() == ISD::FNEG);
30215   bool NegB = (B.getOpcode() == ISD::FNEG);
30216   bool NegC = (C.getOpcode() == ISD::FNEG);
30217 
30218   // Negative multiplication when NegA xor NegB
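  // e.g. (fneg A)*B + C and A*(fneg B) + C map to FNMADD, A*B + (fneg C) maps
  // to FMSUB, and (fneg A)*B + (fneg C) maps to FNMSUB: the sign of the product
  // selects the 'N' form and the sign of C selects ADD vs. SUB.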
30219   bool NegMul = (NegA != NegB);
30220   if (NegA)
30221     A = A.getOperand(0);
30222   if (NegB)
30223     B = B.getOperand(0);
30224   if (NegC)
30225     C = C.getOperand(0);
30226 
30227   unsigned Opcode;
30228   if (!NegMul)
30229     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
30230   else
30231     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
30232 
30233   return DAG.getNode(Opcode, dl, VT, A, B, C);
30234 }
30235 
30236 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
30237                            TargetLowering::DAGCombinerInfo &DCI,
30238                            const X86Subtarget &Subtarget) {
30239   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
30240   //           (and (i32 x86isd::setcc_carry), 1)
30241   // This eliminates the zext. This transformation is necessary because
30242   // ISD::SETCC is always legalized to i8.
30243   SDLoc dl(N);
30244   SDValue N0 = N->getOperand(0);
30245   EVT VT = N->getValueType(0);
30246 
30247   if (N0.getOpcode() == ISD::AND &&
30248       N0.hasOneUse() &&
30249       N0.getOperand(0).hasOneUse()) {
30250     SDValue N00 = N0.getOperand(0);
30251     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30252       if (!isOneConstant(N0.getOperand(1)))
30253         return SDValue();
30254       return DAG.getNode(ISD::AND, dl, VT,
30255                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30256                                      N00.getOperand(0), N00.getOperand(1)),
30257                          DAG.getConstant(1, dl, VT));
30258     }
30259   }
30260 
30261   if (N0.getOpcode() == ISD::TRUNCATE &&
30262       N0.hasOneUse() &&
30263       N0.getOperand(0).hasOneUse()) {
30264     SDValue N00 = N0.getOperand(0);
30265     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30266       return DAG.getNode(ISD::AND, dl, VT,
30267                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30268                                      N00.getOperand(0), N00.getOperand(1)),
30269                          DAG.getConstant(1, dl, VT));
30270     }
30271   }
30272 
30273   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30274     return V;
30275 
30276   if (VT.is256BitVector())
30277     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30278       return R;
30279 
30280   if (SDValue DivRem8 = getDivRem8(N, DAG))
30281     return DivRem8;
30282 
30283   return SDValue();
30284 }
30285 
30286 /// Optimize x == -y --> x+y == 0
30287 ///          x != -y --> x+y != 0
30288 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
30289                             const X86Subtarget &Subtarget) {
30290   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
30291   SDValue LHS = N->getOperand(0);
30292   SDValue RHS = N->getOperand(1);
30293   EVT VT = N->getValueType(0);
30294   SDLoc DL(N);
30295 
30296   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
30297     if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
30298       SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
30299                                  LHS.getOperand(1));
30300       return DAG.getSetCC(DL, N->getValueType(0), addV,
30301                           DAG.getConstant(0, DL, addV.getValueType()), CC);
30302     }
30303   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
30304     if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
30305       SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
30306                                  RHS.getOperand(1));
30307       return DAG.getSetCC(DL, N->getValueType(0), addV,
30308                           DAG.getConstant(0, DL, addV.getValueType()), CC);
30309     }
30310 
30311   if (VT.getScalarType() == MVT::i1 &&
30312       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
30313     bool IsSEXT0 =
30314         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30315         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30316     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30317 
30318     if (!IsSEXT0 || !IsVZero1) {
30319       // Swap the operands and update the condition code.
30320       std::swap(LHS, RHS);
30321       CC = ISD::getSetCCSwappedOperands(CC);
30322 
30323       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30324                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30325       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30326     }
30327 
30328     if (IsSEXT0 && IsVZero1) {
30329       assert(VT == LHS.getOperand(0).getValueType() &&
30330              "Unexpected operand type");
30331       if (CC == ISD::SETGT)
30332         return DAG.getConstant(0, DL, VT);
30333       if (CC == ISD::SETLE)
30334         return DAG.getConstant(1, DL, VT);
30335       if (CC == ISD::SETEQ || CC == ISD::SETGE)
30336         return DAG.getNOT(DL, LHS.getOperand(0), VT);
30337 
30338       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
30339              "Unexpected condition code!");
30340       return LHS.getOperand(0);
30341     }
30342   }
30343 
30344   // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
30345   // via legalization because v4i32 is not a legal type.
30346   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
30347     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
30348 
30349   return SDValue();
30350 }
30351 
30352 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
30353   SDLoc DL(N);
30354   // Gather and Scatter instructions use k-registers for masks. The type of
30355   // the masks is v*i1. So the mask will be truncated anyway.
30356   // The SIGN_EXTEND_INREG may be dropped.
30357   SDValue Mask = N->getOperand(2);
30358   if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
30359     SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
30360     NewOps[2] = Mask.getOperand(0);
30361     DAG.UpdateNodeOperands(N, NewOps);
30362   }
30363   return SDValue();
30364 }
30365 
30366 // Helper function of combineX86SetCC. It materializes "setb reg" as
30367 // "sbb reg,reg", since sbb can be extended without a zext and produces
30368 // an all-ones value which is more useful than 0/1 in some cases.
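// For an i32 result this is roughly "sbb %eax, %eax; andl $1, %eax" rather
// than "setb %al; movzbl %al, %eax" (an illustrative sketch; the exact code
// depends on later combines).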
30369 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
30370                                SelectionDAG &DAG, MVT VT) {
30371   if (VT == MVT::i8)
30372     return DAG.getNode(ISD::AND, DL, VT,
30373                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30374                                    DAG.getConstant(X86::COND_B, DL, MVT::i8),
30375                                    EFLAGS),
30376                        DAG.getConstant(1, DL, VT));
30377   assert (VT == MVT::i1 && "Unexpected type for SETCC node");
30378   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
30379                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30380                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
30381                                  EFLAGS));
30382 }
30383 
30384 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
30385 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
30386                                TargetLowering::DAGCombinerInfo &DCI,
30387                                const X86Subtarget &Subtarget) {
30388   SDLoc DL(N);
30389   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
30390   SDValue EFLAGS = N->getOperand(1);
30391 
30392   if (CC == X86::COND_A) {
30393     // Try to convert COND_A into COND_B in an attempt to facilitate
30394     // materializing "setb reg".
30395     //
30396     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
30397     // cannot take an immediate as its first operand.
30398     //
30399     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
30400         EFLAGS.getValueType().isInteger() &&
30401         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
30402       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
30403                                    EFLAGS.getNode()->getVTList(),
30404                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
30405       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
30406       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
30407     }
30408   }
30409 
30410   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
30411   // a zext and produces an all-ones bit which is more useful than 0/1 in some
30412   // cases.
30413   if (CC == X86::COND_B)
30414     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
30415 
30416   // Try to simplify the EFLAGS and condition code operands.
30417   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30418     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30419     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
30420   }
30421 
30422   return SDValue();
30423 }
30424 
30425 /// Optimize branch condition evaluation.
30426 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
30427                              TargetLowering::DAGCombinerInfo &DCI,
30428                              const X86Subtarget &Subtarget) {
30429   SDLoc DL(N);
30430   SDValue EFLAGS = N->getOperand(3);
30431   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
30432 
30433   // Try to simplify the EFLAGS and condition code operands.
30434   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
30435   // RAUW them under us.
30436   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30437     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30438     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
30439                        N->getOperand(1), Cond, Flags);
30440   }
30441 
30442   return SDValue();
30443 }
30444 
30445 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
30446                                                   SelectionDAG &DAG) {
30447   // Take advantage of vector comparisons producing 0 or -1 in each lane to
30448   // optimize away the operation when it is applied to a constant.
30449   //
30450   // The general transformation is:
30451   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
30452   //       AND(VECTOR_CMP(x,y), constant2)
30453   //    constant2 = UNARYOP(constant)
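  //
  // This works because each lane of VECTOR_CMP is all-zeros or all-ones, so
  // AND(cmp, constant) is either 0 or the constant per lane, and UNARYOP(0)
  // is still the all-zeros bit pattern for the conversions handled here.
  // For example (rough sketch):
  //   sint_to_fp(and(cmp, <4 x i32> <1,1,1,1>))
  //     --> bitcast(and(cmp, bitcast(<4 x float> <1.0,1.0,1.0,1.0>)))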
30454 
30455   // Early exit if this isn't a vector operation, the operand of the
30456   // unary operation isn't a bitwise AND, or if the sizes of the operations
30457   // aren't the same.
30458   EVT VT = N->getValueType(0);
30459   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
30460       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
30461       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
30462     return SDValue();
30463 
30464   // Now check that the other operand of the AND is a constant. We could
30465   // make the transformation for non-constant splats as well, but it's unclear
30466   // that would be a benefit as it would not eliminate any operations, just
30467   // perform one more step in scalar code before moving to the vector unit.
30468   if (BuildVectorSDNode *BV =
30469           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
30470     // Bail out if the vector isn't a constant.
30471     if (!BV->isConstant())
30472       return SDValue();
30473 
30474     // Everything checks out. Build up the new and improved node.
30475     SDLoc DL(N);
30476     EVT IntVT = BV->getValueType(0);
30477     // Create a new constant of the appropriate type for the transformed
30478     // DAG.
30479     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
30480     // The AND node needs bitcasts to/from an integer vector type around it.
30481     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
30482     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
30483                                  N->getOperand(0)->getOperand(0), MaskConst);
30484     SDValue Res = DAG.getBitcast(VT, NewAnd);
30485     return Res;
30486   }
30487 
30488   return SDValue();
30489 }
30490 
30491 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
30492                                const X86Subtarget &Subtarget) {
30493   SDValue Op0 = N->getOperand(0);
30494   EVT VT = N->getValueType(0);
30495   EVT InVT = Op0.getValueType();
30496   EVT InSVT = InVT.getScalarType();
30497   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30498 
30499   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
30500   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
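  // (A zero-extended i8/i16 value is always non-negative as an i32, so the
  // signed conversion below is exact when the unsigned one is not legal.)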
30501   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30502     SDLoc dl(N);
30503     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30504                                  InVT.getVectorNumElements());
30505     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
30506 
30507     if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
30508       return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
30509 
30510     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30511   }
30512 
30513   return SDValue();
30514 }
30515 
30516 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
30517                                const X86Subtarget &Subtarget) {
30518   // First try to optimize away the conversion entirely when it's
30519   // conditionally from a constant. Vectors only.
30520   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
30521     return Res;
30522 
30523   // Now move on to more general possibilities.
30524   SDValue Op0 = N->getOperand(0);
30525   EVT VT = N->getValueType(0);
30526   EVT InVT = Op0.getValueType();
30527   EVT InSVT = InVT.getScalarType();
30528 
30529   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
30530   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
30531   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30532     SDLoc dl(N);
30533     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30534                                  InVT.getVectorNumElements());
30535     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
30536     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30537   }
30538 
30539   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
30540   // a 32-bit target where SSE doesn't support i64->FP operations.
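  // (x87 FILD can load a 64-bit integer directly from memory, which is why the
  // operand is required to be a plain, non-volatile, single-use load below.)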
30541   if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
30542     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
30543     EVT LdVT = Ld->getValueType(0);
30544 
30545     // This transformation is not supported if the result type is f16 or f128.
30546     if (VT == MVT::f16 || VT == MVT::f128)
30547       return SDValue();
30548 
30549     if (!Ld->isVolatile() && !VT.isVector() &&
30550         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
30551         !Subtarget.is64Bit() && LdVT == MVT::i64) {
30552       SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
30553           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
30554       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
30555       return FILDChain;
30556     }
30557   }
30558   return SDValue();
30559 }
30560 
30561 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
30562 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
30563                           X86TargetLowering::DAGCombinerInfo &DCI) {
30564   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
30565   // the result is either zero or one (depending on the input carry bit).
30566   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
30567   if (X86::isZeroNode(N->getOperand(0)) &&
30568       X86::isZeroNode(N->getOperand(1)) &&
30569       // We don't have a good way to replace an EFLAGS use, so only do this when
30570       // dead right now.
30571       SDValue(N, 1).use_empty()) {
30572     SDLoc DL(N);
30573     EVT VT = N->getValueType(0);
30574     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
30575     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
30576                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
30577                                            DAG.getConstant(X86::COND_B, DL,
30578                                                            MVT::i8),
30579                                            N->getOperand(2)),
30580                                DAG.getConstant(1, DL, VT));
30581     return DCI.CombineTo(N, Res1, CarryOut);
30582   }
30583 
30584   return SDValue();
30585 }
30586 
30587 /// fold (add Y, (sete  X, 0)) -> adc  0, Y
30588 ///      (add Y, (setne X, 0)) -> sbb -1, Y
30589 ///      (sub (sete  X, 0), Y) -> sbb  0, Y
30590 ///      (sub (setne X, 0), Y) -> adc -1, Y
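///
/// The zero test is rewritten below as (cmp X, 1): that sets the carry flag
/// exactly when X == 0, so the adc/sbb can consume the (X == 0) predicate
/// directly from EFLAGS.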
30591 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
30592   SDLoc DL(N);
30593 
30594   // Look through ZExts.
30595   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
30596   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
30597     return SDValue();
30598 
30599   SDValue SetCC = Ext.getOperand(0);
30600   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
30601     return SDValue();
30602 
30603   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
30604   if (CC != X86::COND_E && CC != X86::COND_NE)
30605     return SDValue();
30606 
30607   SDValue Cmp = SetCC.getOperand(1);
30608   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
30609       !X86::isZeroNode(Cmp.getOperand(1)) ||
30610       !Cmp.getOperand(0).getValueType().isInteger())
30611     return SDValue();
30612 
30613   SDValue CmpOp0 = Cmp.getOperand(0);
30614   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
30615                                DAG.getConstant(1, DL, CmpOp0.getValueType()));
30616 
30617   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
30618   if (CC == X86::COND_NE)
30619     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
30620                        DL, OtherVal.getValueType(), OtherVal,
30621                        DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
30622                        NewCmp);
30623   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
30624                      DL, OtherVal.getValueType(), OtherVal,
30625                      DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
30626 }
30627 
30628 static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
30629                                 const X86Subtarget &Subtarget) {
30630   SDLoc DL(N);
30631   EVT VT = N->getValueType(0);
30632   SDValue Op0 = N->getOperand(0);
30633   SDValue Op1 = N->getOperand(1);
30634 
30635   if (!VT.isVector() || !VT.isSimple() ||
30636       !(VT.getVectorElementType() == MVT::i32))
30637     return SDValue();
30638 
30639   unsigned RegSize = 128;
30640   if (Subtarget.hasBWI())
30641     RegSize = 512;
30642   else if (Subtarget.hasAVX2())
30643     RegSize = 256;
30644 
30645   // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
30646   if (VT.getSizeInBits() / 4 > RegSize)
30647     return SDValue();
30648 
30649   // Detect the following pattern:
30650   //
30651   // 1:    %2 = zext <N x i8> %0 to <N x i32>
30652   // 2:    %3 = zext <N x i8> %1 to <N x i32>
30653   // 3:    %4 = sub nsw <N x i32> %2, %3
30654   // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30655   // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
30656   // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30657   // 7:    %8 = add nsw <N x i32> %7, %vec.phi
30658   //
30659   // The last instruction must be a reduction add. Instructions 3-6 form an
30660   // ABSDIFF pattern.
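  // This maps onto PSADBW, which sums the absolute differences of eight
  // unsigned bytes into each 64-bit lane, so the reduction only has to add a
  // handful of partial sums.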
30661 
30662   // The two operands of reduction add are from PHI and a select-op as in line 7
30663   // above.
30664   SDValue SelectOp, Phi;
30665   if (Op0.getOpcode() == ISD::VSELECT) {
30666     SelectOp = Op0;
30667     Phi = Op1;
30668   } else if (Op1.getOpcode() == ISD::VSELECT) {
30669     SelectOp = Op1;
30670     Phi = Op0;
30671   } else
30672     return SDValue();
30673 
30674   // Check the condition of the select instruction is greater-than.
30675   SDValue SetCC = SelectOp->getOperand(0);
30676   if (SetCC.getOpcode() != ISD::SETCC)
30677     return SDValue();
30678   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30679   if (CC != ISD::SETGT)
30680     return SDValue();
30681 
30682   Op0 = SelectOp->getOperand(1);
30683   Op1 = SelectOp->getOperand(2);
30684 
30685   // The second operand of SelectOp, Op1, is the negation of its first operand,
30686   // Op0, which is implemented as 0 - Op0.
30687   if (!(Op1.getOpcode() == ISD::SUB &&
30688         ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
30689         Op1.getOperand(1) == Op0))
30690     return SDValue();
30691 
30692   // The first operand of SetCC is the first operand of SelectOp, which is the
30693   // difference between two input vectors.
30694   if (SetCC.getOperand(0) != Op0)
30695     return SDValue();
30696 
30697   // The second operand of > comparison can be either -1 or 0.
30698   if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30699         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30700     return SDValue();
30701 
30702   // The first operand of SelectOp is the difference between two input vectors.
30703   if (Op0.getOpcode() != ISD::SUB)
30704     return SDValue();
30705 
30706   Op1 = Op0.getOperand(1);
30707   Op0 = Op0.getOperand(0);
30708 
30709   // Check if the operands of the diff are zero-extended from vectors of i8.
30710   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30711       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30712       Op1.getOpcode() != ISD::ZERO_EXTEND ||
30713       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30714     return SDValue();
30715 
30716   // SAD pattern detected. Now build a SAD instruction and an addition for
30717   // reduction. Note that the number of elements in the result of SAD is less
30718   // than the number of elements in its input. Therefore, we can only update
30719   // part of the elements in the reduction vector.
30720 
30721   // Legalize the type of the inputs of PSADBW.
30722   EVT InVT = Op0.getOperand(0).getValueType();
30723   if (InVT.getSizeInBits() <= 128)
30724     RegSize = 128;
30725   else if (InVT.getSizeInBits() <= 256)
30726     RegSize = 256;
30727 
30728   unsigned NumConcat = RegSize / InVT.getSizeInBits();
30729   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30730   Ops[0] = Op0.getOperand(0);
30731   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30732   Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30733   Ops[0] = Op1.getOperand(0);
30734   Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30735 
30736   // The output of PSADBW is a vector of i64.
30737   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30738   SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
30739 
30740   // We need to turn the vector of i64 into a vector of i32.
30741   // If the reduction vector is at least as wide as the psadbw result, just
30742   // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
30743   // anyway.
30744   MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30745   if (VT.getSizeInBits() >= ResVT.getSizeInBits())
30746     Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
30747   else
30748     Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
30749 
30750   if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
30751     // Update part of elements of the reduction vector. This is done by first
30752     // extracting a sub-vector from it, updating this sub-vector, and inserting
30753     // it back.
30754     SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
30755                                  DAG.getIntPtrConstant(0, DL));
30756     SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
30757     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
30758                        DAG.getIntPtrConstant(0, DL));
30759   } else
30760     return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
30761 }
30762 
30763 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
30764                           const X86Subtarget &Subtarget) {
30765   const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
30766   if (Flags->hasVectorReduction()) {
30767     if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
30768       return Sad;
30769   }
30770   EVT VT = N->getValueType(0);
30771   SDValue Op0 = N->getOperand(0);
30772   SDValue Op1 = N->getOperand(1);
30773 
30774   // Try to synthesize horizontal adds from adds of shuffles.
30775   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30776        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30777       isHorizontalBinOp(Op0, Op1, true))
30778     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
30779 
30780   return OptimizeConditionalInDecrement(N, DAG);
30781 }
30782 
30783 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
30784                           const X86Subtarget &Subtarget) {
30785   SDValue Op0 = N->getOperand(0);
30786   SDValue Op1 = N->getOperand(1);
30787 
30788   // X86 can't encode an immediate LHS of a sub. See if we can push the
30789   // negation into a preceding instruction.
30790   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
30791     // If the RHS of the sub is a XOR with one use and a constant, invert the
30792     // immediate. Then add one to the LHS of the sub so we can turn
30793     // X-Y -> X+~Y+1, saving one register.
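    // For example (rough sketch): (5 - (x ^ 3)) becomes ((x ^ ~3) + 6), since
    // ~(x ^ 3) == (x ^ ~3) and -v == ~v + 1.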
30794     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
30795         isa<ConstantSDNode>(Op1.getOperand(1))) {
30796       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
30797       EVT VT = Op0.getValueType();
30798       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
30799                                    Op1.getOperand(0),
30800                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
30801       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
30802                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
30803     }
30804   }
30805 
30806   // Try to synthesize horizontal adds from adds of shuffles.
30807   EVT VT = N->getValueType(0);
30808   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30809        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30810       isHorizontalBinOp(Op0, Op1, true))
30811     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
30812 
30813   return OptimizeConditionalInDecrement(N, DAG);
30814 }
30815 
30816 static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
30817                             TargetLowering::DAGCombinerInfo &DCI,
30818                             const X86Subtarget &Subtarget) {
30819   SDLoc DL(N);
30820   MVT VT = N->getSimpleValueType(0);
30821   MVT SVT = VT.getVectorElementType();
30822   SDValue Op = N->getOperand(0);
30823   MVT OpVT = Op.getSimpleValueType();
30824   MVT OpEltVT = OpVT.getVectorElementType();
30825   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
30826 
30827   // Perform any constant folding.
30828   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
30829     SmallVector<SDValue, 4> Vals;
30830     for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30831       SDValue OpElt = Op.getOperand(i);
30832       if (OpElt.getOpcode() == ISD::UNDEF) {
30833         Vals.push_back(DAG.getUNDEF(SVT));
30834         continue;
30835       }
30836       APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
30837       assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
30838       Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
30839       Vals.push_back(DAG.getConstant(Cst, DL, SVT));
30840     }
30841     return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
30842   }
30843 
30844   // (vzext (bitcast (vzext (x)) -> (vzext x)
30845   SDValue V = peekThroughBitcasts(Op);
30846   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
30847     MVT InnerVT = V.getSimpleValueType();
30848     MVT InnerEltVT = InnerVT.getVectorElementType();
30849 
30850     // If the element sizes match exactly, we can just do one larger vzext. This
30851     // is always an exact type match as vzext operates on integer types.
30852     if (OpEltVT == InnerEltVT) {
30853       assert(OpVT == InnerVT && "Types must match for vzext!");
30854       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
30855     }
30856 
30857     // The only other way we can combine them is if only a single element of the
30858     // inner vzext is used in the input to the outer vzext.
30859     if (InnerEltVT.getSizeInBits() < InputBits)
30860       return SDValue();
30861 
30862     // In this case, the inner vzext is completely dead because we're going to
30863     // only look at bits inside of the low element. Just do the outer vzext on
30864     // a bitcast of the input to the inner.
30865     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
30866   }
30867 
30868   // Check if we can bypass extracting and re-inserting an element of an input
30869   // vector. Essentially:
30870   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
30871   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
30872       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
30873       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
30874     SDValue ExtractedV = V.getOperand(0);
30875     SDValue OrigV = ExtractedV.getOperand(0);
30876     if (isNullConstant(ExtractedV.getOperand(1))) {
30877         MVT OrigVT = OrigV.getSimpleValueType();
30878         // Extract a subvector if necessary...
30879         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
30880           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
30881           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
30882                                     OrigVT.getVectorNumElements() / Ratio);
30883           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
30884                               DAG.getIntPtrConstant(0, DL));
30885         }
30886         Op = DAG.getBitcast(OpVT, OrigV);
30887         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
30888       }
30889   }
30890 
30891   return SDValue();
30892 }
30893 
30894 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
30895 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
30896                                   const X86Subtarget &Subtarget) {
30897   SDValue Chain = N->getOperand(0);
30898   SDValue LHS = N->getOperand(1);
30899   SDValue RHS = N->getOperand(2);
30900   MVT VT = RHS.getSimpleValueType();
30901   SDLoc DL(N);
30902 
30903   auto *C = dyn_cast<ConstantSDNode>(RHS);
30904   if (!C || C->getZExtValue() != 1)
30905     return SDValue();
30906 
30907   RHS = DAG.getConstant(-1, DL, VT);
30908   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
30909   return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
30910                                  DAG.getVTList(MVT::i32, MVT::Other),
30911                                  {Chain, LHS, RHS}, VT, MMO);
30912 }
30913 
30914 // TEST (AND a, b), (AND a, b) -> TEST a, b
30915 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
30916   SDValue Op0 = N->getOperand(0);
30917   SDValue Op1 = N->getOperand(1);
30918 
30919   if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
30920     return SDValue();
30921 
30922   EVT VT = N->getValueType(0);
30923   SDLoc DL(N);
30924 
30925   return DAG.getNode(X86ISD::TESTM, DL, VT,
30926                      Op0->getOperand(0), Op0->getOperand(1));
30927 }
30928 
30929 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
30930                                     const X86Subtarget &Subtarget) {
30931   MVT VT = N->getSimpleValueType(0);
30932   SDLoc DL(N);
30933 
30934   if (N->getOperand(0) == N->getOperand(1)) {
30935     if (N->getOpcode() == X86ISD::PCMPEQ)
30936       return getOnesVector(VT, Subtarget, DAG, DL);
30937     if (N->getOpcode() == X86ISD::PCMPGT)
30938       return getZeroVector(VT, Subtarget, DAG, DL);
30939   }
30940 
30941   return SDValue();
30942 }
30943 
30944 
30945 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
30946                                              DAGCombinerInfo &DCI) const {
30947   SelectionDAG &DAG = DCI.DAG;
30948   switch (N->getOpcode()) {
30949   default: break;
30950   case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
30951   case ISD::VSELECT:
30952   case ISD::SELECT:
30953   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
30954   case ISD::BITCAST:        return combineBitcast(N, DAG, Subtarget);
30955   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
30956   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
30957   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
30958   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
30959   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
30960   case ISD::SHL:
30961   case ISD::SRA:
30962   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
30963   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
30964   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
30965   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
30966   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
30967   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
30968   case ISD::STORE:          return combineStore(N, DAG, Subtarget);
30969   case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
30970   case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
30971   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
30972   case ISD::FADD:
30973   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
30974   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
30975   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
30976   case X86ISD::FXOR:
30977   case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
30978   case X86ISD::FMIN:
30979   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
30980   case ISD::FMINNUM:
30981   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
30982   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
30983   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
30984   case X86ISD::BT:          return combineBT(N, DAG, DCI);
30985   case X86ISD::VZEXT_MOVL:  return combineVZextMovl(N, DAG);
30986   case ISD::ANY_EXTEND:
30987   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
30988   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
30989   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
30990   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
30991   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, DCI, Subtarget);
30992   case X86ISD::BRCOND:      return combineBrCond(N, DAG, DCI, Subtarget);
30993   case X86ISD::VZEXT:       return combineVZext(N, DAG, DCI, Subtarget);
30994   case X86ISD::SHUFP:       // Handle all target specific shuffles
30995   case X86ISD::INSERTPS:
30996   case X86ISD::PALIGNR:
30997   case X86ISD::VSHLDQ:
30998   case X86ISD::VSRLDQ:
30999   case X86ISD::BLENDI:
31000   case X86ISD::UNPCKH:
31001   case X86ISD::UNPCKL:
31002   case X86ISD::MOVHLPS:
31003   case X86ISD::MOVLHPS:
31004   case X86ISD::PSHUFB:
31005   case X86ISD::PSHUFD:
31006   case X86ISD::PSHUFHW:
31007   case X86ISD::PSHUFLW:
31008   case X86ISD::MOVSHDUP:
31009   case X86ISD::MOVSLDUP:
31010   case X86ISD::MOVDDUP:
31011   case X86ISD::MOVSS:
31012   case X86ISD::MOVSD:
31013   case X86ISD::VPPERM:
31014   case X86ISD::VPERMV:
31015   case X86ISD::VPERMV3:
31016   case X86ISD::VPERMIL2:
31017   case X86ISD::VPERMILPI:
31018   case X86ISD::VPERMILPV:
31019   case X86ISD::VPERM2X128:
31020   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
31021   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
31022   case ISD::MGATHER:
31023   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
31024   case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
31025   case X86ISD::TESTM:       return combineTestM(N, DAG);
31026   case X86ISD::PCMPEQ:
31027   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
31028   }
31029 
31030   return SDValue();
31031 }
31032 
31033 /// Return true if the target has native support for the specified value type
31034 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
31035 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
31036 /// some i16 instructions are slow.
31037 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
31038   if (!isTypeLegal(VT))
31039     return false;
31040   if (VT != MVT::i16)
31041     return true;
31042 
31043   switch (Opc) {
31044   default:
31045     return true;
31046   case ISD::LOAD:
31047   case ISD::SIGN_EXTEND:
31048   case ISD::ZERO_EXTEND:
31049   case ISD::ANY_EXTEND:
31050   case ISD::SHL:
31051   case ISD::SRL:
31052   case ISD::SUB:
31053   case ISD::ADD:
31054   case ISD::MUL:
31055   case ISD::AND:
31056   case ISD::OR:
31057   case ISD::XOR:
31058     return false;
31059   }
31060 }
31061 
31062 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
31063 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
31064 /// we don't adjust the stack we clobber the first frame index.
31065 /// See X86InstrInfo::copyPhysReg.
31066 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
31067     MachineFunction *MF) const {
31068   const MachineRegisterInfo &MRI = MF->getRegInfo();
31069 
31070   return any_of(MRI.reg_instructions(X86::EFLAGS),
31071                 [](const MachineInstr &RI) { return RI.isCopy(); });
31072 }
31073 
31074 /// This method queries the target whether it is beneficial for the DAG combiner to
31075 /// promote the specified node. If true, it should return the desired promotion
31076 /// type by reference.
31077 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
31078   EVT VT = Op.getValueType();
31079   if (VT != MVT::i16)
31080     return false;
31081 
31082   bool Promote = false;
31083   bool Commute = false;
31084   switch (Op.getOpcode()) {
31085   default: break;
31086   case ISD::SIGN_EXTEND:
31087   case ISD::ZERO_EXTEND:
31088   case ISD::ANY_EXTEND:
31089     Promote = true;
31090     break;
31091   case ISD::SHL:
31092   case ISD::SRL: {
31093     SDValue N0 = Op.getOperand(0);
31094     // Look out for (store (shl (load), x)).
31095     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
31096       return false;
31097     Promote = true;
31098     break;
31099   }
31100   case ISD::ADD:
31101   case ISD::MUL:
31102   case ISD::AND:
31103   case ISD::OR:
31104   case ISD::XOR:
31105     Commute = true;
31106     // fallthrough
31107   case ISD::SUB: {
31108     SDValue N0 = Op.getOperand(0);
31109     SDValue N1 = Op.getOperand(1);
31110     if (!Commute && MayFoldLoad(N1))
31111       return false;
31112     // Avoid disabling potential load folding opportunities.
31113     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
31114       return false;
31115     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
31116       return false;
31117     Promote = true;
31118   }
31119   }
31120 
31121   PVT = MVT::i32;
31122   return Promote;
31123 }
31124 
31125 //===----------------------------------------------------------------------===//
31126 //                           X86 Inline Assembly Support
31127 //===----------------------------------------------------------------------===//
31128 
31129 // Helper to match a string against a sequence of pieces separated by whitespace.
31130 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
31131   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
31132 
31133   for (StringRef Piece : Pieces) {
31134     if (!S.startswith(Piece)) // Check if the piece matches.
31135       return false;
31136 
31137     S = S.substr(Piece.size());
31138     StringRef::size_type Pos = S.find_first_not_of(" \t");
31139     if (Pos == 0) // We matched a prefix.
31140       return false;
31141 
31142     S = S.substr(Pos);
31143   }
31144 
31145   return S.empty();
31146 }
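// Usage sketch: matchAsm("bswap $0", {"bswap", "$0"}) and
// matchAsm("  bswapl   $0", {"bswapl", "$0"}) both return true, while
// matchAsm("bswapper $0", {"bswap", "$0"}) returns false because "bswapper"
// merely starts with "bswap" instead of containing it as a whitespace-separated
// token.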
31147 
31148 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
31149 
31150   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
31151     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
31152         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
31153         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
31154 
31155       if (AsmPieces.size() == 3)
31156         return true;
31157       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
31158         return true;
31159     }
31160   }
31161   return false;
31162 }
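// Expected input sketch: the caller strips the leading "=r,0," from the
// constraint string and sorts the remaining clobber pieces, so a list such as
//   {"~{cc}", "~{flags}", "~{fpsr}"}
// (optionally plus "~{dirflag}") is accepted here; any other combination is
// rejected.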
31163 
31164 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
31165   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
31166 
31167   const std::string &AsmStr = IA->getAsmString();
31168 
31169   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
31170   if (!Ty || Ty->getBitWidth() % 16 != 0)
31171     return false;
31172 
31173   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
31174   SmallVector<StringRef, 4> AsmPieces;
31175   SplitString(AsmStr, AsmPieces, ";\n");
31176 
31177   switch (AsmPieces.size()) {
31178   default: return false;
31179   case 1:
31180     // FIXME: this should verify that we are targeting a 486 or better.  If not,
31181     // we will turn this bswap into something that will be lowered to logical
31182     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
31183     // lower so don't worry about this.
31184     // bswap $0
31185     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
31186         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
31187         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
31188         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
31189         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
31190         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
31191       // No need to check constraints, nothing other than the equivalent of
31192       // "=r,0" would be valid here.
31193       return IntrinsicLowering::LowerToByteSwap(CI);
31194     }
31195 
31196     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
31197     if (CI->getType()->isIntegerTy(16) &&
31198         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31199         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
31200          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
31201       AsmPieces.clear();
31202       StringRef ConstraintsStr = IA->getConstraintString();
31203       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31204       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31205       if (clobbersFlagRegisters(AsmPieces))
31206         return IntrinsicLowering::LowerToByteSwap(CI);
31207     }
31208     break;
31209   case 3:
31210     if (CI->getType()->isIntegerTy(32) &&
31211         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31212         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
31213         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
31214         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
31215       AsmPieces.clear();
31216       StringRef ConstraintsStr = IA->getConstraintString();
31217       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31218       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31219       if (clobbersFlagRegisters(AsmPieces))
31220         return IntrinsicLowering::LowerToByteSwap(CI);
31221     }
31222 
31223     if (CI->getType()->isIntegerTy(64)) {
31224       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
31225       if (Constraints.size() >= 2 &&
31226           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
31227           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
31228         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
31229         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
31230             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
31231             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
31232           return IntrinsicLowering::LowerToByteSwap(CI);
31233       }
31234     }
31235     break;
31236   }
31237   return false;
31238 }
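// Source-level sketch of what gets recognized (hypothetical user code): a
// 32-bit byte swap written as
//   asm("rorw $8, %w0; rorl $16, %0; rorw $8, %w0" : "=r"(x) : "0"(x) : "cc");
// or a 64-bit one written as
//   asm("bswap %%eax; bswap %%edx; xchgl %%eax, %%edx" : "=A"(x) : "0"(x));
// is replaced by a call to the llvm.bswap intrinsic, which isel then lowers to
// BSWAP instructions instead of an opaque inline-asm blob.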
31239 
31240 /// Given a constraint letter, return the type of constraint for this target.
31241 X86TargetLowering::ConstraintType
31242 X86TargetLowering::getConstraintType(StringRef Constraint) const {
31243   if (Constraint.size() == 1) {
31244     switch (Constraint[0]) {
31245     case 'R':
31246     case 'q':
31247     case 'Q':
31248     case 'f':
31249     case 't':
31250     case 'u':
31251     case 'y':
31252     case 'x':
31253     case 'Y':
31254     case 'l':
31255       return C_RegisterClass;
31256     case 'a':
31257     case 'b':
31258     case 'c':
31259     case 'd':
31260     case 'S':
31261     case 'D':
31262     case 'A':
31263       return C_Register;
31264     case 'I':
31265     case 'J':
31266     case 'K':
31267     case 'L':
31268     case 'M':
31269     case 'N':
31270     case 'G':
31271     case 'C':
31272     case 'e':
31273     case 'Z':
31274       return C_Other;
31275     default:
31276       break;
31277     }
31278   }
31279   return TargetLowering::getConstraintType(Constraint);
31280 }
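// For instance, 'a' is classified as C_Register (it names the [ER]AX register),
// 'q' as C_RegisterClass (a class of general-purpose registers), and 'I' as
// C_Other (an immediate in [0, 31]); unknown letters fall back to the generic
// TargetLowering classification.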
31281 
31282 /// Examine constraint type and operand type and determine a weight value.
31283 /// This object must already have been set up with the operand type
31284 /// and the current alternative constraint selected.
31285 TargetLowering::ConstraintWeight
31286   X86TargetLowering::getSingleConstraintMatchWeight(
31287     AsmOperandInfo &info, const char *constraint) const {
31288   ConstraintWeight weight = CW_Invalid;
31289   Value *CallOperandVal = info.CallOperandVal;
31290   // If we don't have a value, we can't do a match,
31291   // but allow it at the lowest weight.
31292   if (!CallOperandVal)
31293     return CW_Default;
31294   Type *type = CallOperandVal->getType();
31295   // Look at the constraint type.
31296   switch (*constraint) {
31297   default:
31298     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
31299   case 'R':
31300   case 'q':
31301   case 'Q':
31302   case 'a':
31303   case 'b':
31304   case 'c':
31305   case 'd':
31306   case 'S':
31307   case 'D':
31308   case 'A':
31309     if (CallOperandVal->getType()->isIntegerTy())
31310       weight = CW_SpecificReg;
31311     break;
31312   case 'f':
31313   case 't':
31314   case 'u':
31315     if (type->isFloatingPointTy())
31316       weight = CW_SpecificReg;
31317     break;
31318   case 'y':
31319     if (type->isX86_MMXTy() && Subtarget.hasMMX())
31320       weight = CW_SpecificReg;
31321     break;
31322   case 'x':
31323   case 'Y':
31324     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
31325         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
31326       weight = CW_Register;
31327     break;
31328   case 'I':
31329     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
31330       if (C->getZExtValue() <= 31)
31331         weight = CW_Constant;
31332     }
31333     break;
31334   case 'J':
31335     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31336       if (C->getZExtValue() <= 63)
31337         weight = CW_Constant;
31338     }
31339     break;
31340   case 'K':
31341     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31342       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
31343         weight = CW_Constant;
31344     }
31345     break;
31346   case 'L':
31347     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31348       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
31349         weight = CW_Constant;
31350     }
31351     break;
31352   case 'M':
31353     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31354       if (C->getZExtValue() <= 3)
31355         weight = CW_Constant;
31356     }
31357     break;
31358   case 'N':
31359     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31360       if (C->getZExtValue() <= 0xff)
31361         weight = CW_Constant;
31362     }
31363     break;
31364   case 'G':
31365   case 'C':
31366     if (isa<ConstantFP>(CallOperandVal)) {
31367       weight = CW_Constant;
31368     }
31369     break;
31370   case 'e':
31371     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31372       if ((C->getSExtValue() >= -0x80000000LL) &&
31373           (C->getSExtValue() <= 0x7fffffffLL))
31374         weight = CW_Constant;
31375     }
31376     break;
31377   case 'Z':
31378     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31379       if (C->getZExtValue() <= 0xffffffff)
31380         weight = CW_Constant;
31381     }
31382     break;
31383   }
31384   return weight;
31385 }
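// Sketch of the returned weights: a constant operand such as 7 scores
// CW_Constant for 'I' (7 <= 31) but leaves 'I' at CW_Invalid when the operand
// is not a constant, while an integer operand paired with a specific-register
// letter such as 'a' or 'd' scores CW_SpecificReg.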
31386 
31387 /// Try to replace an X constraint, which matches anything, with another that
31388 /// has more specific requirements based on the type of the corresponding
31389 /// operand.
31390 const char *X86TargetLowering::
31391 LowerXConstraint(EVT ConstraintVT) const {
31392   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
31393   // 'f' like normal targets.
31394   if (ConstraintVT.isFloatingPoint()) {
31395     if (Subtarget.hasSSE2())
31396       return "Y";
31397     if (Subtarget.hasSSE1())
31398       return "x";
31399   }
31400 
31401   return TargetLowering::LowerXConstraint(ConstraintVT);
31402 }
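// E.g., for a floating-point operand constrained with "X" (as in
//   asm("..." : "=x"(dst) : "X"(src));
// in hypothetical user code), the constraint is tightened to "Y" when SSE2 is
// available, or to "x" with only SSE1, so the operand ends up in an XMM
// register rather than whatever the generic default would pick.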
31403 
31404 /// Lower the specified operand into the Ops vector.
31405 /// If it is invalid, don't add anything to Ops.
31406 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
31407                                                      std::string &Constraint,
31408                                                      std::vector<SDValue>&Ops,
31409                                                      SelectionDAG &DAG) const {
31410   SDValue Result;
31411 
31412   // Only support length 1 constraints for now.
31413   if (Constraint.length() > 1) return;
31414 
31415   char ConstraintLetter = Constraint[0];
31416   switch (ConstraintLetter) {
31417   default: break;
31418   case 'I':
31419     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31420       if (C->getZExtValue() <= 31) {
31421         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31422                                        Op.getValueType());
31423         break;
31424       }
31425     }
31426     return;
31427   case 'J':
31428     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31429       if (C->getZExtValue() <= 63) {
31430         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31431                                        Op.getValueType());
31432         break;
31433       }
31434     }
31435     return;
31436   case 'K':
31437     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31438       if (isInt<8>(C->getSExtValue())) {
31439         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31440                                        Op.getValueType());
31441         break;
31442       }
31443     }
31444     return;
31445   case 'L':
31446     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31447       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
31448           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
31449         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
31450                                        Op.getValueType());
31451         break;
31452       }
31453     }
31454     return;
31455   case 'M':
31456     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31457       if (C->getZExtValue() <= 3) {
31458         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31459                                        Op.getValueType());
31460         break;
31461       }
31462     }
31463     return;
31464   case 'N':
31465     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31466       if (C->getZExtValue() <= 255) {
31467         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31468                                        Op.getValueType());
31469         break;
31470       }
31471     }
31472     return;
31473   case 'O':
31474     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31475       if (C->getZExtValue() <= 127) {
31476         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31477                                        Op.getValueType());
31478         break;
31479       }
31480     }
31481     return;
31482   case 'e': {
31483     // 32-bit signed value
31484     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31485       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31486                                            C->getSExtValue())) {
31487         // Widen to 64 bits here to get it sign extended.
31488         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
31489         break;
31490       }
31491     // FIXME gcc accepts some relocatable values here too, but only in certain
31492     // memory models; it's complicated.
31493     }
31494     return;
31495   }
31496   case 'Z': {
31497     // 32-bit unsigned value
31498     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31499       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31500                                            C->getZExtValue())) {
31501         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31502                                        Op.getValueType());
31503         break;
31504       }
31505     }
31506     // FIXME gcc accepts some relocatable values here too, but only in certain
31507     // memory models; it's complicated.
31508     return;
31509   }
31510   case 'i': {
31511     // Literal immediates are always ok.
31512     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
31513       // Widen to 64 bits here to get it sign extended.
31514       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
31515       break;
31516     }
31517 
31518     // In any sort of PIC mode addresses need to be computed at runtime by
31519     // adding in a register or some sort of table lookup.  These can't
31520     // be used as immediates.
31521     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
31522       return;
31523 
31524     // If we are in non-pic codegen mode, we allow the address of a global (with
31525     // an optional displacement) to be used with 'i'.
31526     GlobalAddressSDNode *GA = nullptr;
31527     int64_t Offset = 0;
31528 
31529     // Match either (GA), (GA+C), (GA+C1+C2), etc.
31530     while (1) {
31531       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
31532         Offset += GA->getOffset();
31533         break;
31534       } else if (Op.getOpcode() == ISD::ADD) {
31535         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31536           Offset += C->getZExtValue();
31537           Op = Op.getOperand(0);
31538           continue;
31539         }
31540       } else if (Op.getOpcode() == ISD::SUB) {
31541         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
31542           Offset += -C->getZExtValue();
31543           Op = Op.getOperand(0);
31544           continue;
31545         }
31546       }
31547 
31548       // Otherwise, this isn't something we can handle, reject it.
31549       return;
31550     }
31551 
31552     const GlobalValue *GV = GA->getGlobal();
31553     // If we require an extra load to get this address, as in PIC mode, we
31554     // can't accept it.
31555     if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
31556       return;
31557 
31558     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
31559                                         GA->getValueType(0), Offset);
31560     break;
31561   }
31562   }
31563 
31564   if (Result.getNode()) {
31565     Ops.push_back(Result);
31566     return;
31567   }
31568   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
31569 }
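// Usage sketch for the immediate constraints above (hypothetical user code):
//   asm volatile("outb %0, %1" : : "a"(val), "N"(port));
// only succeeds here when 'port' folds to a constant in [0, 255]; otherwise
// nothing is pushed onto Ops and the operand is rejected as invalid.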
31570 
31571 /// Check if \p RC is a general purpose register class.
31572 /// I.e., GR* or one of their variant.
31573 static bool isGRClass(const TargetRegisterClass &RC) {
31574   switch (RC.getID()) {
31575   case X86::GR8RegClassID:
31576   case X86::GR8_ABCD_LRegClassID:
31577   case X86::GR8_ABCD_HRegClassID:
31578   case X86::GR8_NOREXRegClassID:
31579   case X86::GR16RegClassID:
31580   case X86::GR16_ABCDRegClassID:
31581   case X86::GR16_NOREXRegClassID:
31582   case X86::GR32RegClassID:
31583   case X86::GR32_ABCDRegClassID:
31584   case X86::GR32_TCRegClassID:
31585   case X86::GR32_NOREXRegClassID:
31586   case X86::GR32_NOAXRegClassID:
31587   case X86::GR32_NOSPRegClassID:
31588   case X86::GR32_NOREX_NOSPRegClassID:
31589   case X86::GR32_ADRegClassID:
31590   case X86::GR64RegClassID:
31591   case X86::GR64_ABCDRegClassID:
31592   case X86::GR64_TCRegClassID:
31593   case X86::GR64_TCW64RegClassID:
31594   case X86::GR64_NOREXRegClassID:
31595   case X86::GR64_NOSPRegClassID:
31596   case X86::GR64_NOREX_NOSPRegClassID:
31597   case X86::LOW32_ADDR_ACCESSRegClassID:
31598   case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
31599     return true;
31600   default:
31601     return false;
31602   }
31603 }
31604 
31605 /// Check if \p RC is a vector register class.
31606 /// I.e., FR* / VR* or one of their variant.
31607 static bool isFRClass(const TargetRegisterClass &RC) {
31608   switch (RC.getID()) {
31609   case X86::FR32RegClassID:
31610   case X86::FR32XRegClassID:
31611   case X86::FR64RegClassID:
31612   case X86::FR64XRegClassID:
31613   case X86::FR128RegClassID:
31614   case X86::VR64RegClassID:
31615   case X86::VR128RegClassID:
31616   case X86::VR128LRegClassID:
31617   case X86::VR128HRegClassID:
31618   case X86::VR128XRegClassID:
31619   case X86::VR256RegClassID:
31620   case X86::VR256LRegClassID:
31621   case X86::VR256HRegClassID:
31622   case X86::VR256XRegClassID:
31623   case X86::VR512RegClassID:
31624     return true;
31625   default:
31626     return false;
31627   }
31628 }
31629 
31630 std::pair<unsigned, const TargetRegisterClass *>
31631 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
31632                                                 StringRef Constraint,
31633                                                 MVT VT) const {
31634   // First, see if this is a constraint that directly corresponds to an LLVM
31635   // register class.
31636   if (Constraint.size() == 1) {
31637     // GCC Constraint Letters
31638     switch (Constraint[0]) {
31639     default: break;
31640       // TODO: Slight differences here in allocation order and leaving
31641       // RIP in the class. Do they matter any more here than they do
31642       // in the normal allocation?
31643     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
31644       if (Subtarget.is64Bit()) {
31645         if (VT == MVT::i32 || VT == MVT::f32)
31646           return std::make_pair(0U, &X86::GR32RegClass);
31647         if (VT == MVT::i16)
31648           return std::make_pair(0U, &X86::GR16RegClass);
31649         if (VT == MVT::i8 || VT == MVT::i1)
31650           return std::make_pair(0U, &X86::GR8RegClass);
31651         if (VT == MVT::i64 || VT == MVT::f64)
31652           return std::make_pair(0U, &X86::GR64RegClass);
31653         break;
31654       }
31655       // 32-bit fallthrough
31656     case 'Q':   // Q_REGS
31657       if (VT == MVT::i32 || VT == MVT::f32)
31658         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
31659       if (VT == MVT::i16)
31660         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
31661       if (VT == MVT::i8 || VT == MVT::i1)
31662         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
31663       if (VT == MVT::i64)
31664         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
31665       break;
31666     case 'r':   // GENERAL_REGS
31667     case 'l':   // INDEX_REGS
31668       if (VT == MVT::i8 || VT == MVT::i1)
31669         return std::make_pair(0U, &X86::GR8RegClass);
31670       if (VT == MVT::i16)
31671         return std::make_pair(0U, &X86::GR16RegClass);
31672       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
31673         return std::make_pair(0U, &X86::GR32RegClass);
31674       return std::make_pair(0U, &X86::GR64RegClass);
31675     case 'R':   // LEGACY_REGS
31676       if (VT == MVT::i8 || VT == MVT::i1)
31677         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
31678       if (VT == MVT::i16)
31679         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
31680       if (VT == MVT::i32 || !Subtarget.is64Bit())
31681         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
31682       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
31683     case 'f':  // FP Stack registers.
31684       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
31685       // value to the correct fpstack register class.
31686       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
31687         return std::make_pair(0U, &X86::RFP32RegClass);
31688       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
31689         return std::make_pair(0U, &X86::RFP64RegClass);
31690       return std::make_pair(0U, &X86::RFP80RegClass);
31691     case 'y':   // MMX_REGS if MMX allowed.
31692       if (!Subtarget.hasMMX()) break;
31693       return std::make_pair(0U, &X86::VR64RegClass);
31694     case 'Y':   // SSE_REGS if SSE2 allowed
31695       if (!Subtarget.hasSSE2()) break;
31696       // FALL THROUGH.
31697     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
31698       if (!Subtarget.hasSSE1()) break;
31699 
31700       switch (VT.SimpleTy) {
31701       default: break;
31702       // Scalar SSE types.
31703       case MVT::f32:
31704       case MVT::i32:
31705         return std::make_pair(0U, &X86::FR32RegClass);
31706       case MVT::f64:
31707       case MVT::i64:
31708         return std::make_pair(0U, &X86::FR64RegClass);
31709       // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
31710       // Vector types.
31711       case MVT::v16i8:
31712       case MVT::v8i16:
31713       case MVT::v4i32:
31714       case MVT::v2i64:
31715       case MVT::v4f32:
31716       case MVT::v2f64:
31717         return std::make_pair(0U, &X86::VR128RegClass);
31718       // AVX types.
31719       case MVT::v32i8:
31720       case MVT::v16i16:
31721       case MVT::v8i32:
31722       case MVT::v4i64:
31723       case MVT::v8f32:
31724       case MVT::v4f64:
31725         return std::make_pair(0U, &X86::VR256RegClass);
31726       case MVT::v8f64:
31727       case MVT::v16f32:
31728       case MVT::v16i32:
31729       case MVT::v8i64:
31730         return std::make_pair(0U, &X86::VR512RegClass);
31731       }
31732       break;
31733     }
31734   }
31735 
31736   // Use the default implementation in TargetLowering to convert the register
31737   // constraint into a member of a register class.
31738   std::pair<unsigned, const TargetRegisterClass*> Res;
31739   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
31740 
31741   // Not found as a standard register?
31742   if (!Res.second) {
31743     // Map "st(0)" through "st(7)" to the corresponding FP0-FP7 registers.
31744     if (Constraint.size() == 7 && Constraint[0] == '{' &&
31745         tolower(Constraint[1]) == 's' &&
31746         tolower(Constraint[2]) == 't' &&
31747         Constraint[3] == '(' &&
31748         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
31749         Constraint[5] == ')' &&
31750         Constraint[6] == '}') {
31751 
31752       Res.first = X86::FP0+Constraint[4]-'0';
31753       Res.second = &X86::RFP80RegClass;
31754       return Res;
31755     }
31756 
31757     // GCC allows "st(0)" to be called just plain "st".
31758     if (StringRef("{st}").equals_lower(Constraint)) {
31759       Res.first = X86::FP0;
31760       Res.second = &X86::RFP80RegClass;
31761       return Res;
31762     }
31763 
31764     // flags -> EFLAGS
31765     if (StringRef("{flags}").equals_lower(Constraint)) {
31766       Res.first = X86::EFLAGS;
31767       Res.second = &X86::CCRRegClass;
31768       return Res;
31769     }
31770 
31771     // 'A' means EAX + EDX.
31772     if (Constraint == "A") {
31773       Res.first = X86::EAX;
31774       Res.second = &X86::GR32_ADRegClass;
31775       return Res;
31776     }
31777     return Res;
31778   }
31779 
31780   // Otherwise, check to see if this is a register class of the wrong value
31781   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
31782   // turn into {ax},{dx}.
31783   // MVT::Other is used to specify clobber names.
31784   if (Res.second->hasType(VT) || VT == MVT::Other)
31785     return Res;   // Correct type already, nothing to do.
31786 
31787   // Get a matching integer of the correct size, i.e., "ax" with MVT::i32 should
31788   // return "eax". This should even work for things like getting 64-bit integer
31789   // registers when given an f64 type.
31790   const TargetRegisterClass *Class = Res.second;
31791   // The generic code will match the first register class that contains the
31792   // given register. Thus, based on the ordering of the tablegened file,
31793   // the "plain" GR classes might not come first.
31794   // Therefore, use a helper method.
31795   if (isGRClass(*Class)) {
31796     unsigned Size = VT.getSizeInBits();
31797     if (Size == 1) Size = 8;
31798     unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
31799     if (DestReg > 0) {
31800       Res.first = DestReg;
31801       Res.second = Size == 8 ? &X86::GR8RegClass
31802                  : Size == 16 ? &X86::GR16RegClass
31803                  : Size == 32 ? &X86::GR32RegClass
31804                  : &X86::GR64RegClass;
31805       assert(Res.second->contains(Res.first) && "Register in register class");
31806     } else {
31807       // No register found/type mismatch.
31808       Res.first = 0;
31809       Res.second = nullptr;
31810     }
31811   } else if (isFRClass(*Class)) {
31812     // Handle references to XMM physical registers that got mapped into the
31813     // wrong class.  This can happen with constraints like {xmm0} where the
31814     // target independent register mapper will just pick the first match it can
31815     // find, ignoring the required type.
31816 
31817     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
31818     if (VT == MVT::f32 || VT == MVT::i32)
31819       Res.second = &X86::FR32RegClass;
31820     else if (VT == MVT::f64 || VT == MVT::i64)
31821       Res.second = &X86::FR64RegClass;
31822     else if (X86::VR128RegClass.hasType(VT))
31823       Res.second = &X86::VR128RegClass;
31824     else if (X86::VR256RegClass.hasType(VT))
31825       Res.second = &X86::VR256RegClass;
31826     else if (X86::VR512RegClass.hasType(VT))
31827       Res.second = &X86::VR512RegClass;
31828     else {
31829       // Type mismatch and not a clobber: return an error.
31830       Res.first = 0;
31831       Res.second = nullptr;
31832     }
31833   }
31834 
31835   return Res;
31836 }
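// Worked example of the resizing above: "{ax}" paired with an i32 operand is
// first resolved by the generic code to AX in a 16-bit class such as GR16;
// since that class cannot hold i32, getX86SubSuperRegisterOrZero(AX, 32)
// rewrites the result to EAX in GR32, which is the "{ax},i32" -> {eax} mapping
// mentioned in the comment above.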
31837 
31838 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
31839                                             const AddrMode &AM, Type *Ty,
31840                                             unsigned AS) const {
31841   // Scaling factors are not free at all.
31842   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
31843   // will take 2 allocations in the out of order engine instead of 1
31844   // for plain addressing mode, i.e. inst (reg1).
31845   // E.g.,
31846   // vaddps (%rsi,%rdx), %ymm0, %ymm1
31847   // Requires two allocations (one for the load, one for the computation)
31848   // whereas:
31849   // vaddps (%rsi), %ymm0, %ymm1
31850   // Requires just 1 allocation, i.e., freeing allocations for other operations
31851   // and having less micro operations to execute.
31852   //
31853   // For some X86 architectures, this is even worse because for instance for
31854   // stores, the complex addressing mode forces the instruction to use the
31855   // "load" ports instead of the dedicated "store" port.
31856   // E.g., on Haswell:
31857   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
31858   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
31859   if (isLegalAddressingMode(DL, AM, Ty, AS))
31860     // Scale represents reg2 * scale, thus account for 1
31861     // as soon as we use a second register.
31862     return AM.Scale != 0;
31863   return -1;
31864 }
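// Resulting costs (sketch): a plain access such as (%rdi) returns 0, a scaled
// access such as (%rdi,%rcx,4) returns 1 (the extra allocation described
// above), and an addressing mode that is not legal at all returns -1.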
31865 
31866 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
31867   // Integer division on x86 is expensive. However, when aggressively optimizing
31868   // for code size, we prefer to use a div instruction, as it is usually smaller
31869   // than the alternative sequence.
31870   // The exception to this is vector division. Since x86 doesn't have vector
31871   // integer division, leaving the division as-is is a loss even in terms of
31872   // size, because it will have to be scalarized, while the alternative code
31873   // sequence can be performed in vector form.
31874   bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
31875                                    Attribute::MinSize);
31876   return OptSize && !VT.isVector();
31877 }
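// E.g., under minsize a scalar "x / 10" keeps the idiv (smaller than the
// magic-number multiply expansion), whereas a <4 x i32> division by a constant
// is still expanded, since scalarized idivs would be both bigger and slower.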
31878 
31879 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
31880   if (!Subtarget.is64Bit())
31881     return;
31882 
31883   // Update IsSplitCSR in X86MachineFunctionInfo.
31884   X86MachineFunctionInfo *AFI =
31885     Entry->getParent()->getInfo<X86MachineFunctionInfo>();
31886   AFI->setIsSplitCSR(true);
31887 }
31888 
31889 void X86TargetLowering::insertCopiesSplitCSR(
31890     MachineBasicBlock *Entry,
31891     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
31892   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
31893   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
31894   if (!IStart)
31895     return;
31896 
31897   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31898   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
31899   MachineBasicBlock::iterator MBBI = Entry->begin();
31900   for (const MCPhysReg *I = IStart; *I; ++I) {
31901     const TargetRegisterClass *RC = nullptr;
31902     if (X86::GR64RegClass.contains(*I))
31903       RC = &X86::GR64RegClass;
31904     else
31905       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
31906 
31907     unsigned NewVR = MRI->createVirtualRegister(RC);
31908     // Create copy from CSR to a virtual register.
31909     // FIXME: this currently does not emit CFI pseudo-instructions, it works
31910     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
31911     // nounwind. If we want to generalize this later, we may need to emit
31912     // CFI pseudo-instructions.
31913     assert(Entry->getParent()->getFunction()->hasFnAttribute(
31914                Attribute::NoUnwind) &&
31915            "Function should be nounwind in insertCopiesSplitCSR!");
31916     Entry->addLiveIn(*I);
31917     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
31918         .addReg(*I);
31919 
31920     // Insert the copy-back instructions right before the terminator.
31921     for (auto *Exit : Exits)
31922       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
31923               TII->get(TargetOpcode::COPY), *I)
31924           .addReg(NewVR);
31925   }
31926 }
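// Net effect (sketch): for a CXX_FAST_TLS function on x86-64, each callee-saved
// GPR reported by getCalleeSavedRegsViaCopy (e.g. RBX) is copied into a fresh
// virtual register in the entry block and copied back just before the
// terminator of every exit block, so the save/restore can be optimized by the
// register allocator instead of being pinned to prologue/epilogue spill code.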
31927