//===- InstCombineVectorOps.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements instcombine for ExtractElement, InsertElement and
// ShuffleVector.
//
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumAggregateReconstructionsSimplified,
          "Number of aggregate reconstructions turned into reuse of the "
          "original aggregate");

/// Return true if the value is cheaper to scalarize than it is to leave as a
/// vector operation. IsConstantExtractIndex indicates whether we are extracting
/// one known element from a vector constant.
///
/// FIXME: It's possible to create more instructions than previously existed.
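///
/// For example (illustrative IR, not taken from a test): extracting a lane
/// at a constant index from a one-use binop is considered cheap when one of
/// its operands is a constant, because
///   %b = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
///   %e = extractelement <4 x i32> %b, i64 0
/// can become a scalar add of the extracted lane of %v and the constant 1.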
static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
  // If we can pick a scalar constant value out of a vector, that is free.
  if (auto *C = dyn_cast<Constant>(V))
    return IsConstantExtractIndex || C->getSplatValue();

  // An insertelement to the same constant index as our extract will simplify
  // to the scalar inserted element. An insertelement to a different constant
  // index is irrelevant to our extract.
  if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
    return IsConstantExtractIndex;

  if (match(V, m_OneUse(m_Load(m_Value()))))
    return true;

  if (match(V, m_OneUse(m_UnOp())))
    return true;

  Value *V0, *V1;
  if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
    if (cheapToScalarize(V0, IsConstantExtractIndex) ||
        cheapToScalarize(V1, IsConstantExtractIndex))
      return true;

  CmpInst::Predicate UnusedPred;
  if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
    if (cheapToScalarize(V0, IsConstantExtractIndex) ||
        cheapToScalarize(V1, IsConstantExtractIndex))
      return true;

  return false;
}

// If we have a PHI node with a vector type that is only used to feed
// itself and be an operand of extractelement at a constant location,
// try to replace the PHI of the vector type with a PHI of a scalar type.
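//
// For example (illustrative IR): with a loop-carried vector PHI
//   %vec = phi <4 x float> [ %init, %entry ], [ %next, %loop ]
//   %next = fadd <4 x float> %vec, %step
//   %s = extractelement <4 x float> %vec, i64 1
// the extract can be fed by a scalar PHI of lane 1 instead (assuming the
// fadd is cheap to scalarize, e.g. %step is a constant splat), letting the
// vector PHI and its feeding binop become dead.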
Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,
                                            PHINode *PN) {
  SmallVector<Instruction *, 2> Extracts;
  // The users we want the PHI to have are:
  // 1) The EI ExtractElement (we already know this)
  // 2) Possibly more ExtractElements with the same index.
  // 3) Another operand, which will feed back into the PHI.
  Instruction *PHIUser = nullptr;
  for (auto U : PN->users()) {
    if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
      if (EI.getIndexOperand() == EU->getIndexOperand())
        Extracts.push_back(EU);
      else
        return nullptr;
    } else if (!PHIUser) {
      PHIUser = cast<Instruction>(U);
    } else {
      return nullptr;
    }
  }

  if (!PHIUser)
    return nullptr;

  // Verify that this PHI user has one use, which is the PHI itself,
  // and that it is a binary operation which is cheap to scalarize.
  // Otherwise, return nullptr.
  if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
      !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
    return nullptr;

  // Create a scalar PHI node that will replace the vector PHI node
  // just before the current PHI node.
  PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
      PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
  // Scalarize each PHI operand.
  for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
    Value *PHIInVal = PN->getIncomingValue(i);
    BasicBlock *inBB = PN->getIncomingBlock(i);
    Value *Elt = EI.getIndexOperand();
    // If the operand is the PHI induction variable:
    if (PHIInVal == PHIUser) {
      // Scalarize the binary operation. Its first operand is the
      // scalar PHI, and the second operand is extracted from the other
      // vector operand.
      BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
      unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
      Value *Op = InsertNewInstWith(
          ExtractElementInst::Create(B0->getOperand(opId), Elt,
                                     B0->getOperand(opId)->getName() + ".Elt"),
          *B0);
      Value *newPHIUser = InsertNewInstWith(
          BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
                                                scalarPHI, Op, B0), *B0);
      scalarPHI->addIncoming(newPHIUser, inBB);
    } else {
      // Scalarize PHI input:
      Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
      // Insert the new instruction into the predecessor basic block.
      Instruction *pos = dyn_cast<Instruction>(PHIInVal);
      BasicBlock::iterator InsertPos;
      if (pos && !isa<PHINode>(pos)) {
        InsertPos = ++pos->getIterator();
      } else {
        InsertPos = inBB->getFirstInsertionPt();
      }

      InsertNewInstWith(newEI, *InsertPos);

      scalarPHI->addIncoming(newEI, inBB);
    }
  }

  for (auto E : Extracts)
    replaceInstUsesWith(*E, scalarPHI);

  return &EI;
}

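/// Fold an extractelement whose vector operand is a bitcast from another
/// vector type. For example (illustrative IR, little-endian target):
///   %wide = insertelement <2 x i64> %x, i64 %s, i64 1
///   %cast = bitcast <2 x i64> %wide to <8 x i16>
///   %e = extractelement <8 x i16> %cast, i64 5
/// may become a lshr of %s by 16 followed by a trunc to i16.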
static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
                                      InstCombiner::BuilderTy &Builder,
                                      bool IsBigEndian) {
  Value *X;
  uint64_t ExtIndexC;
  if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||
      !X->getType()->isVectorTy() ||
      !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))
    return nullptr;

  // If this extractelement is using a bitcast from a vector of the same number
  // of elements, see if we can find the source element from the source vector:
  // extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
  auto *SrcTy = cast<VectorType>(X->getType());
  Type *DestTy = Ext.getType();
  ElementCount NumSrcElts = SrcTy->getElementCount();
  ElementCount NumElts =
      cast<VectorType>(Ext.getVectorOperandType())->getElementCount();
  if (NumSrcElts == NumElts)
    if (Value *Elt = findScalarElement(X, ExtIndexC))
      return new BitCastInst(Elt, DestTy);

  assert(NumSrcElts.isScalable() == NumElts.isScalable() &&
         "Src and Dst must be the same sort of vector type");

  // If the source elements are wider than the destination, try to shift and
  // truncate a subset of scalar bits of an insert op.
  if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) {
    Value *Scalar;
    uint64_t InsIndexC;
    if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar),
                              m_ConstantInt(InsIndexC))))
      return nullptr;

    // The extract must be from the subset of vector elements that we inserted
    // into. Example: if we inserted element 1 of a <2 x i64> and we are
    // extracting an i16 (narrowing ratio = 4), then this extract must be from
    // one of elements 4-7 of the bitcasted vector.
    unsigned NarrowingRatio =
        NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue();
    if (ExtIndexC / NarrowingRatio != InsIndexC)
      return nullptr;

    // We are extracting part of the original scalar. How that scalar is
    // inserted into the vector depends on the endian-ness. Example:
    //              Vector Byte Elt Index:    0  1  2  3  4  5  6  7
    //                                       +--+--+--+--+--+--+--+--+
    // inselt <2 x i32> V, <i32> S, 1:       |V0|V1|V2|V3|S0|S1|S2|S3|
    // extelt <4 x i16> V', 3:               |                 |S2|S3|
    //                                       +--+--+--+--+--+--+--+--+
    // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value.
    // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value.
    // In this example, we must right-shift little-endian. Big-endian is just a
    // truncate.
    unsigned Chunk = ExtIndexC % NarrowingRatio;
    if (IsBigEndian)
      Chunk = NarrowingRatio - 1 - Chunk;

    // Bail out if this is an FP vector to FP vector sequence. That would take
    // more instructions than we started with unless there is no shift, and it
    // may not be handled as well in the backend.
    bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();
    bool NeedDestBitcast = DestTy->isFloatingPointTy();
    if (NeedSrcBitcast && NeedDestBitcast)
      return nullptr;

    unsigned SrcWidth = SrcTy->getScalarSizeInBits();
    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
    unsigned ShAmt = Chunk * DestWidth;

    // TODO: This limitation is more strict than necessary. We could sum the
    // number of new instructions and subtract the number eliminated to know if
    // we can proceed.
    if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())
      if (NeedSrcBitcast || NeedDestBitcast)
        return nullptr;

    if (NeedSrcBitcast) {
      Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);
      Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);
    }

    if (ShAmt) {
      // Bail out if we could end with more instructions than we started with.
      if (!Ext.getVectorOperand()->hasOneUse())
        return nullptr;
      Scalar = Builder.CreateLShr(Scalar, ShAmt);
    }

    if (NeedDestBitcast) {
      Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);
      return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);
    }
    return new TruncInst(Scalar, DestTy);
  }

  return nullptr;
}

/// Find elements of V demanded by UserInstr.
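///
/// For example (illustrative): if UserInstr is
///   %e = extractelement <4 x i32> %v, i64 2
/// only bit 2 of the returned mask is set; for a shufflevector user, a bit
/// is set for each lane of V that the mask actually reads. Unknown users
/// conservatively demand all elements.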
static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
  unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();

  // Conservatively assume that all elements are needed.
  APInt UsedElts(APInt::getAllOnesValue(VWidth));

  switch (UserInstr->getOpcode()) {
  case Instruction::ExtractElement: {
    ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
    assert(EEI->getVectorOperand() == V);
    ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
    if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
      UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
    }
    break;
  }
  case Instruction::ShuffleVector: {
    ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
    unsigned MaskNumElts =
        cast<FixedVectorType>(UserInstr->getType())->getNumElements();

    UsedElts = APInt(VWidth, 0);
    for (unsigned i = 0; i < MaskNumElts; i++) {
      unsigned MaskVal = Shuffle->getMaskValue(i);
      if (MaskVal == -1u || MaskVal >= 2 * VWidth)
        continue;
      if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
        UsedElts.setBit(MaskVal);
      if (Shuffle->getOperand(1) == V &&
          ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
        UsedElts.setBit(MaskVal - VWidth);
    }
    break;
  }
  default:
    break;
  }
  return UsedElts;
}

/// Find union of elements of V demanded by all its users.
/// If it is known by querying findDemandedEltsBySingleUser that
/// no user demands an element of V, then the corresponding bit
/// remains unset in the returned value.
static APInt findDemandedEltsByAllUsers(Value *V) {
  unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();

  APInt UnionUsedElts(VWidth, 0);
  for (const Use &U : V->uses()) {
    if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
      UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
    } else {
      UnionUsedElts = APInt::getAllOnesValue(VWidth);
      break;
    }

    if (UnionUsedElts.isAllOnesValue())
      break;
  }

  return UnionUsedElts;
}

Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
  Value *SrcVec = EI.getVectorOperand();
  Value *Index = EI.getIndexOperand();
  if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
                                            SQ.getWithInstruction(&EI)))
    return replaceInstUsesWith(EI, V);

  // If extracting a specified index from the vector, see if we can recursively
  // find a previously computed scalar that was inserted into the vector.
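  // For example (illustrative IR):
  //   %v1 = insertelement <4 x i32> %v0, i32 %s, i64 1
  //   %e  = extractelement <4 x i32> %v1, i64 1
  // folds to %s.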
  auto *IndexC = dyn_cast<ConstantInt>(Index);
  if (IndexC) {
    ElementCount EC = EI.getVectorOperandType()->getElementCount();
    unsigned NumElts = EC.getKnownMinValue();

    // InstSimplify should handle cases where the index is invalid.
    // For a fixed-length vector, it's invalid to extract an out-of-range
    // element.
    if (!EC.isScalable() && IndexC->getValue().uge(NumElts))
      return nullptr;

    // This instruction only demands the single element from the input vector.
    // Skip scalable types; the number of elements is unknown at compile time.
    if (!EC.isScalable() && NumElts != 1) {
      // If the input vector has a single use, simplify it based on this use
      // property.
      if (SrcVec->hasOneUse()) {
        APInt UndefElts(NumElts, 0);
        APInt DemandedElts(NumElts, 0);
        DemandedElts.setBit(IndexC->getZExtValue());
        if (Value *V =
                SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
          return replaceOperand(EI, 0, V);
      } else {
        // If the input vector has multiple uses, simplify it based on a union
        // of all elements used.
        APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
        if (!DemandedElts.isAllOnesValue()) {
          APInt UndefElts(NumElts, 0);
          if (Value *V = SimplifyDemandedVectorElts(
                  SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
                  true /* AllowMultipleUsers */)) {
            if (V != SrcVec) {
              SrcVec->replaceAllUsesWith(V);
              return &EI;
            }
          }
        }
      }
    }
    if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
      return I;

    // If there's a vector PHI feeding a scalar use through this extractelement
    // instruction, try to scalarize the PHI.
    if (auto *Phi = dyn_cast<PHINode>(SrcVec))
      if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))
        return ScalarPHI;
  }

  // TODO: come up with an n-ary matcher that subsumes both unary and
  // binary matchers.
  UnaryOperator *UO;
  if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
    // extelt (unop X), Index --> unop (extelt X, Index)
    Value *X = UO->getOperand(0);
    Value *E = Builder.CreateExtractElement(X, Index);
    return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);
  }

  BinaryOperator *BO;
  if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
    // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
    Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
    Value *E0 = Builder.CreateExtractElement(X, Index);
    Value *E1 = Builder.CreateExtractElement(Y, Index);
    return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);
  }

  Value *X, *Y;
  CmpInst::Predicate Pred;
  if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
      cheapToScalarize(SrcVec, IndexC)) {
    // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
    Value *E0 = Builder.CreateExtractElement(X, Index);
    Value *E1 = Builder.CreateExtractElement(Y, Index);
    return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
  }

  if (auto *I = dyn_cast<Instruction>(SrcVec)) {
    if (auto *IE = dyn_cast<InsertElementInst>(I)) {
      // Extracting the inserted element?
      if (IE->getOperand(2) == Index)
        return replaceInstUsesWith(EI, IE->getOperand(1));
      // If the inserted and extracted elements are constants, they must not
      // be the same value; extract from the pre-inserted value instead.
      if (isa<Constant>(IE->getOperand(2)) && IndexC)
        return replaceOperand(EI, 0, IE->getOperand(0));
    } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
      // If this is extracting an element from a shufflevector, figure out
      // where it came from and extract from the appropriate input element
      // instead. Restrict the following transformation to fixed-length
      // vectors.
      if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {
        int SrcIdx =
            SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());
        Value *Src;
        unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())
                                ->getNumElements();

        if (SrcIdx < 0)
          return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
        if (SrcIdx < (int)LHSWidth)
          Src = SVI->getOperand(0);
        else {
          SrcIdx -= LHSWidth;
          Src = SVI->getOperand(1);
        }
        Type *Int32Ty = Type::getInt32Ty(EI.getContext());
        return ExtractElementInst::Create(
            Src, ConstantInt::get(Int32Ty, SrcIdx, false));
      }
    } else if (auto *CI = dyn_cast<CastInst>(I)) {
      // Canonicalize extractelement(cast) -> cast(extractelement).
      // Bitcasts can change the number of vector elements, and they cost
      // nothing.
      if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
        Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
        return CastInst::Create(CI->getOpcode(), EE, EI.getType());
      }
    }
  }
  return nullptr;
}

/// If V is a shuffle of values that ONLY returns elements from either LHS or
/// RHS, return the shuffle mask and true. Otherwise, return false.
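///
/// For example (illustrative): a chain that inserts (extractelement RHS, 0)
/// into lane 2 of LHS for <4 x i32> operands produces Mask = <0, 1, 4, 3>,
/// where indices >= 4 select from RHS.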
static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
                                         SmallVectorImpl<int> &Mask) {
  assert(LHS->getType() == RHS->getType() &&
         "Invalid CollectSingleShuffleElements");
  unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();

  if (isa<UndefValue>(V)) {
    Mask.assign(NumElts, -1);
    return true;
  }

  if (V == LHS) {
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i);
    return true;
  }

  if (V == RHS) {
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i + NumElts);
    return true;
  }

  if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
    // If this is an insert of an extract from some other vector, include it.
    Value *VecOp    = IEI->getOperand(0);
    Value *ScalarOp = IEI->getOperand(1);
    Value *IdxOp    = IEI->getOperand(2);

    if (!isa<ConstantInt>(IdxOp))
      return false;
    unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();

    if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
      // We can handle this if the vector we are inserting into is
      // transitively ok.
      if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
        // If so, update the mask to reflect the inserted undef.
        Mask[InsertedIdx] = -1;
        return true;
      }
    } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
      if (isa<ConstantInt>(EI->getOperand(1))) {
        unsigned ExtractedIdx =
            cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
        unsigned NumLHSElts =
            cast<FixedVectorType>(LHS->getType())->getNumElements();

        // This must be extracting from either LHS or RHS.
        if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
          // We can handle this if the vector we are inserting into is
          // transitively ok.
          if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
            // If so, update the mask to reflect the inserted value.
            if (EI->getOperand(0) == LHS) {
              Mask[InsertedIdx % NumElts] = ExtractedIdx;
            } else {
              assert(EI->getOperand(0) == RHS);
              Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;
            }
            return true;
          }
        }
      }
    }
  }

  return false;
}

/// If we have insertion into a vector that is wider than the vector that we
/// are extracting from, try to widen the source vector to allow a single
/// shufflevector to replace one or more insert/extract pairs.
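///
/// For example (illustrative IR): extracts from a <4 x float> feeding
/// inserts into an <8 x float> can instead extract from
///   %wide = shufflevector <4 x float> %src, <4 x float> undef,
///           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef,
///                      i32 undef, i32 undef>
/// so that a later combine can turn the insert/extract chain into one
/// shuffle.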
static void replaceExtractElements(InsertElementInst *InsElt,
                                   ExtractElementInst *ExtElt,
                                   InstCombinerImpl &IC) {
  auto *InsVecType = cast<FixedVectorType>(InsElt->getType());
  auto *ExtVecType = cast<FixedVectorType>(ExtElt->getVectorOperandType());
  unsigned NumInsElts = InsVecType->getNumElements();
  unsigned NumExtElts = ExtVecType->getNumElements();

  // The inserted-to vector must be wider than the extracted-from vector.
  if (InsVecType->getElementType() != ExtVecType->getElementType() ||
      NumExtElts >= NumInsElts)
    return;

  // Create a shuffle mask to widen the extracted-from vector using undefined
  // values. The mask selects all of the values of the original vector followed
  // by as many undefined values as needed to create a vector of the same
  // length as the inserted-to vector.
  SmallVector<int, 16> ExtendMask;
  for (unsigned i = 0; i < NumExtElts; ++i)
    ExtendMask.push_back(i);
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    ExtendMask.push_back(-1);

  Value *ExtVecOp = ExtElt->getVectorOperand();
  auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
  BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
                                   ? ExtVecOpInst->getParent()
                                   : ExtElt->getParent();

  // TODO: This restriction matches the basic block check below when creating
  // new extractelement instructions. If that limitation is removed, this one
  // could also be removed. But for now, we just bail out to ensure that we
  // will replace the extractelement instruction that is feeding our
  // insertelement instruction. This allows the insertelement to then be
  // replaced by a shufflevector. If the insertelement is not replaced, we can
  // induce infinite looping because there's an optimization for extractelement
  // that will delete our widening shuffle. This would trigger another attempt
  // here to create that shuffle, and we spin forever.
  if (InsertionBlock != InsElt->getParent())
    return;

  // TODO: This restriction matches the check in visitInsertElementInst() and
  // prevents an infinite loop caused by not turning the extract/insert pair
  // into a shuffle. We really should not need either check, but we're lacking
  // folds for shufflevectors because we're afraid to generate shuffle masks
  // that the backend can't handle.
  if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
    return;

  auto *WideVec =
      new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), ExtendMask);

  // Insert the new shuffle after the vector operand of the extract is defined
  // (as long as it's not a PHI) or at the start of the basic block of the
  // extract, so any subsequent extracts in the same basic block can use it.
  // TODO: Insert before the earliest ExtractElementInst that is replaced.
  if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
    WideVec->insertAfter(ExtVecOpInst);
  else
    IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());

  // Replace extracts from the original narrow vector with extracts from the
  // new wide vector.
  for (User *U : ExtVecOp->users()) {
    ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
    if (!OldExt || OldExt->getParent() != WideVec->getParent())
      continue;
    auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
    NewExt->insertAfter(OldExt);
    IC.replaceInstUsesWith(*OldExt, NewExt);
  }
}

/// We are building a shuffle to create V, which is a sequence of insertelement,
/// extractelement pairs. If PermittedRHS is set, then we must either use it or
/// not rely on the second vector source. Return a std::pair containing the
/// left and right vectors of the proposed shuffle (or 0), and set the Mask
/// parameter as required.
///
/// Note: we intentionally don't try to fold earlier shuffles since they have
/// often been chosen carefully to be efficiently implementable on the target.
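///
/// For example (illustrative): if V was built by inserting extracted lanes
/// of %a and %b one at a time, this returns {%a, %b} plus a mask routing
/// each lane to its source, which the caller can turn into a single
/// shufflevector.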
using ShuffleOps = std::pair<Value *, Value *>;

static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
                                         Value *PermittedRHS,
                                         InstCombinerImpl &IC) {
  assert(V->getType()->isVectorTy() && "Invalid shuffle!");
  unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();

  if (isa<UndefValue>(V)) {
    Mask.assign(NumElts, -1);
    return std::make_pair(
        PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
  }

  if (isa<ConstantAggregateZero>(V)) {
    Mask.assign(NumElts, 0);
    return std::make_pair(V, nullptr);
  }

  if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
    // If this is an insert of an extract from some other vector, include it.
    Value *VecOp    = IEI->getOperand(0);
    Value *ScalarOp = IEI->getOperand(1);
    Value *IdxOp    = IEI->getOperand(2);

    if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
      if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
        unsigned ExtractedIdx =
          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
        unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();

        // Either the extracted-from or inserted-into vector must be RHSVec,
        // otherwise we'd end up with a shuffle of three inputs.
        if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
          Value *RHS = EI->getOperand(0);
          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
          assert(LR.second == nullptr || LR.second == RHS);

          if (LR.first->getType() != RHS->getType()) {
            // Although we are giving up for now, see if we can create extracts
            // that match the inserts for another round of combining.
            replaceExtractElements(IEI, EI, IC);

            // We tried our best, but we can't find anything compatible with
            // RHS further up the chain. Return a trivial shuffle.
            for (unsigned i = 0; i < NumElts; ++i)
              Mask[i] = i;
            return std::make_pair(V, nullptr);
          }

          unsigned NumLHSElts =
              cast<FixedVectorType>(RHS->getType())->getNumElements();
          Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;
          return std::make_pair(LR.first, RHS);
        }

        if (VecOp == PermittedRHS) {
          // We've gone as far as we can: anything on the other side of the
          // extractelement will already have been converted into a shuffle.
          unsigned NumLHSElts =
              cast<FixedVectorType>(EI->getOperand(0)->getType())
                  ->getNumElements();
          for (unsigned i = 0; i != NumElts; ++i)
            Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);
          return std::make_pair(EI->getOperand(0), PermittedRHS);
        }

        // If this insertelement is a chain that comes from exactly these two
        // vectors, return the vector and the effective shuffle.
        if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
            collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
                                         Mask))
          return std::make_pair(EI->getOperand(0), PermittedRHS);
      }
    }
  }

  // Otherwise, we can't do anything fancy. Return an identity vector.
  for (unsigned i = 0; i != NumElts; ++i)
    Mask.push_back(i);
  return std::make_pair(V, nullptr);
}

/// Look for a chain of insertvalue's that fully define an aggregate, trace
/// back the values inserted, and see if they all were extractvalue'd from
/// the same source aggregate at the exact same element indices.
/// If they were, just reuse the source aggregate.
/// This potentially deals with PHI indirections.
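///
/// For example (illustrative IR):
///   %e0 = extractvalue { i8, i32 } %agg, 0
///   %e1 = extractvalue { i8, i32 } %agg, 1
///   %i0 = insertvalue { i8, i32 } undef, i8 %e0, 0
///   %i1 = insertvalue { i8, i32 } %i0, i32 %e1, 1
/// simplifies to just %agg.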
Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse(
    InsertValueInst &OrigIVI) {
  Type *AggTy = OrigIVI.getType();
  unsigned NumAggElts;
  switch (AggTy->getTypeID()) {
  case Type::StructTyID:
    NumAggElts = AggTy->getStructNumElements();
    break;
  case Type::ArrayTyID:
    NumAggElts = AggTy->getArrayNumElements();
    break;
  default:
    llvm_unreachable("Unhandled aggregate type?");
  }

  // Arbitrary aggregate size cut-off. Motivation for limit of 2 is to be able
  // to handle clang C++ exception struct (which is hardcoded as {i8*, i32}).
  // FIXME: any interesting patterns to be caught with larger limit?
  assert(NumAggElts > 0 && "Aggregate should have elements.");
  if (NumAggElts > 2)
    return nullptr;

  static constexpr auto NotFound = None;
  static constexpr auto FoundMismatch = nullptr;

  // Try to find a value of each element of an aggregate.
  // FIXME: deal with more complex, not one-dimensional, aggregate types
  SmallVector<Optional<Value *>, 2> AggElts(NumAggElts, NotFound);

  // Do we know values for each element of the aggregate?
  auto KnowAllElts = [&AggElts]() {
    return all_of(AggElts,
                  [](Optional<Value *> Elt) { return Elt != NotFound; });
  };

  int Depth = 0;

  // Arbitrary `insertvalue` visitation depth limit. Let's be okay with
  // every element being overwritten twice, which should never happen.
  static const int DepthLimit = 2 * NumAggElts;

  // Recurse up the chain of `insertvalue` aggregate operands until either we've
  // reconstructed full initializer or can't visit any more `insertvalue`'s.
  for (InsertValueInst *CurrIVI = &OrigIVI;
       Depth < DepthLimit && CurrIVI && !KnowAllElts();
       CurrIVI = dyn_cast<InsertValueInst>(CurrIVI->getAggregateOperand()),
                       ++Depth) {
    Value *InsertedValue = CurrIVI->getInsertedValueOperand();
    ArrayRef<unsigned int> Indices = CurrIVI->getIndices();

    // Don't bother with more than single-level aggregates.
    if (Indices.size() != 1)
      return nullptr; // FIXME: deal with more complex aggregates?

    // Now, we may have already previously recorded the value for this element
    // of an aggregate. If we did, that means the CurrIVI will later be
    // overwritten with the already-recorded value. But if not, let's record it!
    Optional<Value *> &Elt = AggElts[Indices.front()];
    Elt = Elt.getValueOr(InsertedValue);

    // FIXME: should we handle chain-terminating undef base operand?
  }

  // Was that sufficient to deduce the full initializer for the aggregate?
  if (!KnowAllElts())
    return nullptr; // Give up then.

  // We now want to find the source[s] of the aggregate elements we've found.
  // And with "source" we mean the original aggregate[s] from which
  // the inserted elements were extracted. This may require PHI translation.

  enum class AggregateDescription {
    /// When analyzing the value that was inserted into an aggregate, we did
    /// not manage to find defining `extractvalue` instruction to analyze.
    NotFound,
    /// When analyzing the value that was inserted into an aggregate, we did
    /// manage to find defining `extractvalue` instruction[s], and everything
    /// matched perfectly - aggregate type, element insertion/extraction index.
    Found,
    /// When analyzing the value that was inserted into an aggregate, we did
    /// manage to find defining `extractvalue` instruction, but there was
    /// a mismatch: either the source type the extraction was from didn't
    /// match the aggregate type the insertion was into,
    /// or the extraction/insertion channels mismatched,
    /// or different elements had different source aggregates.
    FoundMismatch
  };
  auto Describe = [](Optional<Value *> SourceAggregate) {
    if (SourceAggregate == NotFound)
      return AggregateDescription::NotFound;
    if (*SourceAggregate == FoundMismatch)
      return AggregateDescription::FoundMismatch;
    return AggregateDescription::Found;
  };

  // Given the value \p Elt that was being inserted into element \p EltIdx of an
  // aggregate AggTy, see if \p Elt was originally defined by an
  // appropriate extractvalue (same element index, same aggregate type).
  // If found, return the source aggregate from which the extraction was.
  // If \p PredBB is provided, does PHI translation of an \p Elt first.
  auto FindSourceAggregate =
      [&](Value *Elt, unsigned EltIdx, Optional<BasicBlock *> UseBB,
          Optional<BasicBlock *> PredBB) -> Optional<Value *> {
    // For now(?), only deal with, at most, a single level of PHI indirection.
    if (UseBB && PredBB)
      Elt = Elt->DoPHITranslation(*UseBB, *PredBB);
    // FIXME: deal with multiple levels of PHI indirection?

    // Did we find an extraction?
    auto *EVI = dyn_cast<ExtractValueInst>(Elt);
    if (!EVI)
      return NotFound;

    Value *SourceAggregate = EVI->getAggregateOperand();

    // Is the extraction from the same type into which the insertion was?
    if (SourceAggregate->getType() != AggTy)
      return FoundMismatch;
    // And the element index doesn't change between extraction and insertion?
    if (EVI->getNumIndices() != 1 || EltIdx != EVI->getIndices().front())
      return FoundMismatch;

    return SourceAggregate; // AggregateDescription::Found
  };

  // Given elements AggElts that were constructing an aggregate OrigIVI,
  // see if we can find an appropriate source aggregate for each of the
  // elements, and see if it's the same aggregate for each element.
  // If so, return it.
  auto FindCommonSourceAggregate =
      [&](Optional<BasicBlock *> UseBB,
          Optional<BasicBlock *> PredBB) -> Optional<Value *> {
    Optional<Value *> SourceAggregate;

    for (auto I : enumerate(AggElts)) {
      assert(Describe(SourceAggregate) != AggregateDescription::FoundMismatch &&
             "We don't store nullptr in SourceAggregate!");
      assert((Describe(SourceAggregate) == AggregateDescription::Found) ==
                 (I.index() != 0) &&
             "SourceAggregate should be valid after the first element.");

      // For this element, is there a plausible source aggregate?
      // FIXME: we could special-case undef element, IFF we know that in the
      //        source aggregate said element isn't poison.
      Optional<Value *> SourceAggregateForElement =
          FindSourceAggregate(*I.value(), I.index(), UseBB, PredBB);

      // Okay, what have we found? Does that correlate with previous findings?

      // Regardless of whether or not we have previously found source
      // aggregate for previous elements (if any), if we didn't find one for
      // this element, passthrough whatever we have just found.
      if (Describe(SourceAggregateForElement) != AggregateDescription::Found)
        return SourceAggregateForElement;

      // Okay, we have found source aggregate for this element.
      // Let's see what we already know from previous elements, if any.
      switch (Describe(SourceAggregate)) {
      case AggregateDescription::NotFound:
        // This is apparently the first element that we have examined.
        SourceAggregate = SourceAggregateForElement; // Record the aggregate!
        continue; // Great, now look at next element.
      case AggregateDescription::Found:
        // We have previously already successfully examined other elements.
        // Is this the same source aggregate we've found for other elements?
        if (*SourceAggregateForElement != *SourceAggregate)
          return FoundMismatch;
        continue; // Still the same aggregate, look at next element.
      case AggregateDescription::FoundMismatch:
        llvm_unreachable("Can't happen. We would have early-exited then.");
      };
    }

    assert(Describe(SourceAggregate) == AggregateDescription::Found &&
           "Must be a valid Value");
    return *SourceAggregate;
  };

  Optional<Value *> SourceAggregate;

  // Can we find the source aggregate without looking at predecessors?
  SourceAggregate = FindCommonSourceAggregate(/*UseBB=*/None, /*PredBB=*/None);
  if (Describe(SourceAggregate) != AggregateDescription::NotFound) {
    if (Describe(SourceAggregate) == AggregateDescription::FoundMismatch)
      return nullptr; // Conflicting source aggregates!
    ++NumAggregateReconstructionsSimplified;
    return replaceInstUsesWith(OrigIVI, *SourceAggregate);
  }

  // Okay, apparently we need to look at predecessors.

  // We should be smart about picking the "use" basic block, which will be the
  // merge point for aggregate, where we'll insert the final PHI that will be
  // used instead of OrigIVI. Basic block of OrigIVI is *not* the right choice.
  // We should look in which blocks each of the AggElts is being defined,
  // they all should be defined in the same basic block.
  BasicBlock *UseBB = nullptr;

  for (const Optional<Value *> &Elt : AggElts) {
    // If this element's value was not defined by an instruction, ignore it.
    auto *I = dyn_cast<Instruction>(*Elt);
    if (!I)
      continue;
    // Otherwise, in which basic block is this instruction located?
    BasicBlock *BB = I->getParent();
    // If it's the first instruction we've encountered, record the basic block.
    if (!UseBB) {
      UseBB = BB;
      continue;
    }
    // Otherwise, this must be the same basic block we've seen previously.
    if (UseBB != BB)
      return nullptr;
  }

  // If *all* of the elements are basic-block-independent, meaning they are
  // either function arguments, or constant expressions, then if we didn't
  // handle them without predecessor-aware handling, we won't handle them now.
  if (!UseBB)
    return nullptr;

  // If we didn't manage to find source aggregate without looking at
  // predecessors, and there are no predecessors to look at, then we're done.
  if (pred_empty(UseBB))
    return nullptr;

  // Arbitrary predecessor count limit.
  static const int PredCountLimit = 64;

  // Cache the (non-uniqified!) list of predecessors in a vector,
  // checking the limit at the same time for efficiency.
  SmallVector<BasicBlock *, 4> Preds; // May have duplicates!
  for (BasicBlock *Pred : predecessors(UseBB)) {
    // Don't bother if there are too many predecessors.
    if (Preds.size() >= PredCountLimit) // FIXME: only count duplicates once?
      return nullptr;
    Preds.emplace_back(Pred);
  }

  // For each predecessor, what is the source aggregate from which all the
  // elements were originally extracted?
  // Note that we want the map to have a stable iteration order!
  SmallDenseMap<BasicBlock *, Value *, 4> SourceAggregates;
  for (BasicBlock *Pred : Preds) {
    std::pair<decltype(SourceAggregates)::iterator, bool> IV =
        SourceAggregates.insert({Pred, nullptr});
    // Did we already evaluate this predecessor?
    if (!IV.second)
      continue;

    // Let's hope that when coming from predecessor Pred, all elements of the
    // aggregate produced by OrigIVI must have been originally extracted from
    // the same aggregate. Is that so? Can we find said original aggregate?
    SourceAggregate = FindCommonSourceAggregate(UseBB, Pred);
    if (Describe(SourceAggregate) != AggregateDescription::Found)
      return nullptr; // Give up.
    IV.first->second = *SourceAggregate;
  }

  // All good! Now we just need to thread the source aggregates here.
  // Note that we have to insert the new PHI here, ourselves, because we can't
  // rely on InstCombinerImpl::run() inserting it into the right basic block.
  // Note that the same block can be a predecessor more than once,
  // and we need to preserve that invariant for the PHI node.
  BuilderTy::InsertPointGuard Guard(Builder);
  Builder.SetInsertPoint(UseBB->getFirstNonPHI());
  auto *PHI =
      Builder.CreatePHI(AggTy, Preds.size(), OrigIVI.getName() + ".merged");
  for (BasicBlock *Pred : Preds)
    PHI->addIncoming(SourceAggregates[Pred], Pred);

  ++NumAggregateReconstructionsSimplified;
  return replaceInstUsesWith(OrigIVI, PHI);
}

/// Try to find redundant insertvalue instructions, like the following ones:
///  %0 = insertvalue { i8, i32 } undef, i8 %x, 0
///  %1 = insertvalue { i8, i32 } %0,    i8 %y, 0
/// Here the second instruction inserts values at the same indices as the
/// first one, making the first one redundant.
/// It should be transformed to:
///  %0 = insertvalue { i8, i32 } undef, i8 %y, 0
Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) {
  bool IsRedundant = false;
  ArrayRef<unsigned int> FirstIndices = I.getIndices();

  // If there is a chain of insertvalue instructions (each of them except the
  // last one has only one use and it's another insertvalue insn from this
  // chain), check if any of the 'children' uses the same indices as the first
  // instruction. In this case, the first one is redundant.
  Value *V = &I;
  unsigned Depth = 0;
  while (V->hasOneUse() && Depth < 10) {
    User *U = V->user_back();
    auto UserInsInst = dyn_cast<InsertValueInst>(U);
    if (!UserInsInst || U->getOperand(0) != V)
      break;
    if (UserInsInst->getIndices() == FirstIndices) {
      IsRedundant = true;
      break;
    }
    V = UserInsInst;
    Depth++;
  }

  if (IsRedundant)
    return replaceInstUsesWith(I, I.getOperand(0));

  if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I))
    return NewI;

  return nullptr;
}

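// A "select-equivalent" shuffle chooses every lane from the same lane of one
// of its two operands. For example (illustrative), with <4 x i32> operands,
// mask <0, 5, 2, 7> is select-like, while <1, 5, 2, 7> is not, because lane 0
// of the result comes from lane 1 of the first operand.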
static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
  // Cannot analyze a scalable type; the number of elements is not a
  // compile-time constant.
  if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))
    return false;

  int MaskSize = Shuf.getShuffleMask().size();
  int VecSize =
      cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();

  // A vector select does not change the size of the operands.
  if (MaskSize != VecSize)
    return false;

  // Each mask element must be undefined or choose a vector element from one of
  // the source operands without crossing vector lanes.
  for (int i = 0; i != MaskSize; ++i) {
    int Elt = Shuf.getMaskValue(i);
    if (Elt != -1 && Elt != i && Elt != i + VecSize)
      return false;
  }

  return true;
}

/// Turn a chain of inserts that splats a value into an insert + shuffle:
/// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
/// shufflevector(insertelt(X, %k, 0), undef, zero)
static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
  // We are interested in the last insert in a chain. So if this insert has a
  // single user and that user is an insert, bail.
  if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
    return nullptr;

  VectorType *VecTy = InsElt.getType();
  // Cannot handle a scalable type; the number of elements is not a
  // compile-time constant.
  if (isa<ScalableVectorType>(VecTy))
    return nullptr;
  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();

  // Do not try to do this for a one-element vector, since that's a no-op,
  // and will cause an infinite loop.
  if (NumElements == 1)
    return nullptr;

  Value *SplatVal = InsElt.getOperand(1);
  InsertElementInst *CurrIE = &InsElt;
  SmallBitVector ElementPresent(NumElements, false);
  InsertElementInst *FirstIE = nullptr;

  // Walk the chain backwards, keeping track of which indices we inserted into,
  // until we hit something that isn't an insert of the splatted value.
  while (CurrIE) {
    auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
    if (!Idx || CurrIE->getOperand(1) != SplatVal)
      return nullptr;

    auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
    // Check none of the intermediate steps have any additional uses, except
    // for the root insertelement instruction, which can be re-used, if it
    // inserts at position 0.
    if (CurrIE != &InsElt &&
        (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero())))
      return nullptr;

    ElementPresent[Idx->getZExtValue()] = true;
    FirstIE = CurrIE;
    CurrIE = NextIE;
  }

  // If this is just a single insertelement (not a sequence), we are done.
  if (FirstIE == &InsElt)
    return nullptr;

  // If we are not inserting into an undef vector, make sure we've seen an
  // insert into every element.
  // TODO: If the base vector is not undef, it might be better to create a splat
  //       and then a select-shuffle (blend) with the base vector.
  if (!isa<UndefValue>(FirstIE->getOperand(0)))
    if (!ElementPresent.all())
      return nullptr;

  // Create the insert + shuffle.
  Type *Int32Ty = Type::getInt32Ty(InsElt.getContext());
  UndefValue *UndefVec = UndefValue::get(VecTy);
  Constant *Zero = ConstantInt::get(Int32Ty, 0);
  if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero())
    FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt);

  // Splat from element 0, but replace absent elements with undef in the mask.
  SmallVector<int, 16> Mask(NumElements, 0);
  for (unsigned i = 0; i != NumElements; ++i)
    if (!ElementPresent[i])
      Mask[i] = -1;

  return new ShuffleVectorInst(FirstIE, UndefVec, Mask);
}

/// Try to fold an insert element into an existing splat shuffle by changing
/// the shuffle's mask to include the index of this insert element.
static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
  // Check if the vector operand of this insert is a canonical splat shuffle.
  auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
  if (!Shuf || !Shuf->isZeroEltSplat())
    return nullptr;

  // Bail out early if the shuffle has a scalable type; the number of elements
  // in the shuffle mask is unknown at compile time.
  if (isa<ScalableVectorType>(Shuf->getType()))
    return nullptr;

  // Check for a constant insertion index.
  uint64_t IdxC;
  if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
    return nullptr;

  // Check if the splat shuffle's input is the same as this insert's scalar op.
  Value *X = InsElt.getOperand(1);
  Value *Op0 = Shuf->getOperand(0);
  if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))
    return nullptr;

  // Replace the shuffle mask element at the index of this insert with a zero.
  // For example:
  // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
  //   --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
  unsigned NumMaskElts =
      cast<FixedVectorType>(Shuf->getType())->getNumElements();
  SmallVector<int, 16> NewMask(NumMaskElts);
  for (unsigned i = 0; i != NumMaskElts; ++i)
    NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);

  return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
}

/// Try to fold an extract+insert element into an existing identity shuffle by
/// changing the shuffle's mask to include the index of this insert element.
static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
  // Check if the vector operand of this insert is an identity shuffle.
  auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
  if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) ||
      !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))
    return nullptr;

  // Bail out early if the shuffle has a scalable type; the number of elements
  // in the shuffle mask is unknown at compile time.
  if (isa<ScalableVectorType>(Shuf->getType()))
    return nullptr;

  // Check for a constant insertion index.
  uint64_t IdxC;
  if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
    return nullptr;

  // Check if this insert's scalar op is extracted from the identity shuffle's
  // input vector.
  Value *Scalar = InsElt.getOperand(1);
  Value *X = Shuf->getOperand(0);
  if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))
    return nullptr;

  // Replace the shuffle mask element at the index of this extract+insert with
  // that same index value.
  // For example:
  // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'
  unsigned NumMaskElts =
      cast<FixedVectorType>(Shuf->getType())->getNumElements();
  SmallVector<int, 16> NewMask(NumMaskElts);
  ArrayRef<int> OldMask = Shuf->getShuffleMask();
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    if (i != IdxC) {
      // All mask elements besides the inserted element remain the same.
      NewMask[i] = OldMask[i];
    } else if (OldMask[i] == (int)IdxC) {
      // If the mask element was already set, there's nothing to do
      // (demanded elements analysis may unset it later).
      return nullptr;
    } else {
      assert(OldMask[i] == UndefMaskElem &&
             "Unexpected shuffle mask element for identity shuffle");
      NewMask[i] = IdxC;
    }
  }

  return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);
}

/// If we have an insertelement instruction feeding into another insertelement
/// and the 2nd is inserting a constant into the vector, canonicalize that
/// constant insertion before the insertion of a variable:
///
/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
///
/// This has the potential of eliminating the 2nd insertelement instruction
/// via constant folding of the scalar constant into a vector constant.
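///
/// For example (illustrative IR), with X = undef:
///   %i1 = insertelement <2 x i32> undef, i32 %y, i64 0
///   %i2 = insertelement <2 x i32> %i1, i32 7, i64 1
/// becomes a single insert of %y into the vector constant
/// <i32 undef, i32 7>.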
hoistInsEltConst(InsertElementInst & InsElt2,InstCombiner::BuilderTy & Builder)1223 static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
1224                                      InstCombiner::BuilderTy &Builder) {
1225   auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
1226   if (!InsElt1 || !InsElt1->hasOneUse())
1227     return nullptr;
1228 
1229   Value *X, *Y;
1230   Constant *ScalarC;
1231   ConstantInt *IdxC1, *IdxC2;
1232   if (match(InsElt1->getOperand(0), m_Value(X)) &&
1233       match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
1234       match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
1235       match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
1236       match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
1237     Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
1238     return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
1239   }
1240 
1241   return nullptr;
1242 }
1243 
1244 /// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
1245 /// --> shufflevector X, CVec', Mask'
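/// For example (a sketch with 4-element operands):
///   insertelt (shufflevector X, <C0,C1,C2,C3>, <0,5,2,7>), C, 1
///     --> shufflevector X, <C0,C,C2,C3>, <0,5,2,7>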
1246 static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
1247   auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
1248   // Bail out if the parent has more than one use. In that case, we'd be
1249   // replacing the insertelt with a shuffle, and that's not a clear win.
1250   if (!Inst || !Inst->hasOneUse())
1251     return nullptr;
1252   if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
1253     // The shuffle must have a constant vector operand. The insertelt must have
1254     // a constant scalar being inserted at a constant position in the vector.
1255     Constant *ShufConstVec, *InsEltScalar;
1256     uint64_t InsEltIndex;
1257     if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
1258         !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
1259         !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
1260       return nullptr;
1261 
1262     // Adding an element to an arbitrary shuffle could be expensive, but a
1263     // shuffle that selects elements from vectors without crossing lanes is
1264     // assumed cheap.
1265     // If we're just adding a constant into that shuffle, it will still be
1266     // cheap.
1267     if (!isShuffleEquivalentToSelect(*Shuf))
1268       return nullptr;
1269 
1270     // From the above 'select' check, we know that the mask has the same number
1271     // of elements as the vector input operands. We also know that each constant
1272     // input element is used in its lane and cannot be used more than once by
1273     // the shuffle. Therefore, replace the constant in the shuffle's constant
1274     // vector with the insertelt constant. Replace the constant in the shuffle's
1275     // mask vector with the insertelt index plus the length of the vector
1276     // (because the constant vector operand of a shuffle is always the 2nd
1277     // operand).
1278     ArrayRef<int> Mask = Shuf->getShuffleMask();
1279     unsigned NumElts = Mask.size();
1280     SmallVector<Constant *, 16> NewShufElts(NumElts);
1281     SmallVector<int, 16> NewMaskElts(NumElts);
1282     for (unsigned I = 0; I != NumElts; ++I) {
1283       if (I == InsEltIndex) {
1284         NewShufElts[I] = InsEltScalar;
1285         NewMaskElts[I] = InsEltIndex + NumElts;
1286       } else {
1287         // Copy over the existing values.
1288         NewShufElts[I] = ShufConstVec->getAggregateElement(I);
1289         NewMaskElts[I] = Mask[I];
1290       }
1291     }
1292 
1293     // Create new operands for a shuffle that includes the constant of the
1294     // original insertelt. The old shuffle will be dead now.
1295     return new ShuffleVectorInst(Shuf->getOperand(0),
1296                                  ConstantVector::get(NewShufElts), NewMaskElts);
1297   } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
1298     // Transform sequences of insertelement ops with constant data/indices into
1299     // a single shuffle op.
1300     // Cannot handle scalable types: the number of elements needed to create
1301     // the shuffle mask is not a compile-time constant.
1302     if (isa<ScalableVectorType>(InsElt.getType()))
1303       return nullptr;
1304     unsigned NumElts =
1305         cast<FixedVectorType>(InsElt.getType())->getNumElements();
1306 
1307     uint64_t InsertIdx[2];
1308     Constant *Val[2];
1309     if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
1310         !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
1311         !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
1312         !match(IEI->getOperand(1), m_Constant(Val[1])))
1313       return nullptr;
1314     SmallVector<Constant *, 16> Values(NumElts);
1315     SmallVector<int, 16> Mask(NumElts);
1316     auto ValI = std::begin(Val);
1317     // Generate new constant vector and mask.
1318     // We have 2 values/masks from the insertelements instructions. Insert them
1319     // into new value/mask vectors.
1320     for (uint64_t I : InsertIdx) {
1321       if (!Values[I]) {
1322         Values[I] = *ValI;
1323         Mask[I] = NumElts + I;
1324       }
1325       ++ValI;
1326     }
1327     // Remaining values are filled with 'undef' values.
1328     for (unsigned I = 0; I < NumElts; ++I) {
1329       if (!Values[I]) {
1330         Values[I] = UndefValue::get(InsElt.getType()->getElementType());
1331         Mask[I] = I;
1332       }
1333     }
1334     // Create new operands for a shuffle that includes the constant of the
1335     // original insertelt.
1336     return new ShuffleVectorInst(IEI->getOperand(0),
1337                                  ConstantVector::get(Values), Mask);
1338   }
1339   return nullptr;
1340 }
1341 
1342 Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
1343   Value *VecOp    = IE.getOperand(0);
1344   Value *ScalarOp = IE.getOperand(1);
1345   Value *IdxOp    = IE.getOperand(2);
1346 
1347   if (auto *V = SimplifyInsertElementInst(
1348           VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
1349     return replaceInstUsesWith(IE, V);
1350 
1351   // If the scalar is bitcast and inserted into undef, do the insert in the
1352   // source type followed by bitcast.
1353   // TODO: Generalize for insert into any constant, not just undef?
1354   Value *ScalarSrc;
1355   if (match(VecOp, m_Undef()) &&
1356       match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&
1357       (ScalarSrc->getType()->isIntegerTy() ||
1358        ScalarSrc->getType()->isFloatingPointTy())) {
1359     // inselt undef, (bitcast ScalarSrc), IdxOp -->
1360     //   bitcast (inselt undef, ScalarSrc, IdxOp)
1361     Type *ScalarTy = ScalarSrc->getType();
1362     Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());
1363     UndefValue *NewUndef = UndefValue::get(VecTy);
1364     Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);
1365     return new BitCastInst(NewInsElt, IE.getType());
1366   }
1367 
1368   // If the vector and scalar are both bitcast from the same element type, do
1369   // the insert in that source type followed by bitcast.
1370   Value *VecSrc;
1371   if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&
1372       match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&
1373       (VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&
1374       VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&
1375       cast<VectorType>(VecSrc->getType())->getElementType() ==
1376           ScalarSrc->getType()) {
1377     // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->
1378     //   bitcast (inselt VecSrc, ScalarSrc, IdxOp)
1379     Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);
1380     return new BitCastInst(NewInsElt, IE.getType());
1381   }
1382 
1383   // If the inserted element was extracted from some other fixed-length vector
1384   // and both indexes are valid constants, try to turn this into a shuffle.
1385   // Cannot handle scalable vector types: the number of elements needed to
1386   // create the shuffle mask is not a compile-time constant.
1387   uint64_t InsertedIdx, ExtractedIdx;
1388   Value *ExtVecOp;
1389   if (isa<FixedVectorType>(IE.getType()) &&
1390       match(IdxOp, m_ConstantInt(InsertedIdx)) &&
1391       match(ScalarOp,
1392             m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&
1393       isa<FixedVectorType>(ExtVecOp->getType()) &&
1394       ExtractedIdx <
1395           cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {
1396     // TODO: Looking at the user(s) to determine if this insert is a
1397     // fold-to-shuffle opportunity does not match the usual instcombine
1398     // constraints. We should decide if the transform is worthy based only
1399     // on this instruction and its operands, but that may not work currently.
1400     //
1401     // Here, we are trying to avoid creating shuffles before reaching
1402     // the end of a chain of extract-insert pairs. This is complicated because
1403     // we do not generally form arbitrary shuffle masks in instcombine
1404     // (because those may codegen poorly), but collectShuffleElements() does
1405     // exactly that.
1406     //
1407     // The rules for determining what is an acceptable target-independent
1408     // shuffle mask are fuzzy because they evolve based on the backend's
1409     // capabilities and real-world impact.
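    // For example (a sketch): given the chain
    //   inselt (inselt undef, (extelt X, 0), 0), (extelt X, 1), 1
    // we only try to form "shuf X, undef, <0,1,undef,undef>" at the final
    // insert, not at the inner one.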
1410     auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
1411       if (!Insert.hasOneUse())
1412         return true;
1413       auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
1414       if (!InsertUser)
1415         return true;
1416       return false;
1417     };
1418 
1419     // Try to form a shuffle from a chain of extract-insert ops.
1420     if (isShuffleRootCandidate(IE)) {
1421       SmallVector<int, 16> Mask;
1422       ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
1423 
1424       // The proposed shuffle may be trivial, in which case we shouldn't
1425       // perform the combine.
1426       if (LR.first != &IE && LR.second != &IE) {
1427         // We now have a shuffle of LHS, RHS, Mask.
1428         if (LR.second == nullptr)
1429           LR.second = UndefValue::get(LR.first->getType());
1430         return new ShuffleVectorInst(LR.first, LR.second, Mask);
1431       }
1432     }
1433   }
1434 
1435   if (auto *VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
1436     unsigned VWidth = VecTy->getNumElements();
1437     APInt UndefElts(VWidth, 0);
1438     APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
1439     if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
1440       if (V != &IE)
1441         return replaceInstUsesWith(IE, V);
1442       return &IE;
1443     }
1444   }
1445 
1446   if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
1447     return Shuf;
1448 
1449   if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
1450     return NewInsElt;
1451 
1452   if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE))
1453     return Broadcast;
1454 
1455   if (Instruction *Splat = foldInsEltIntoSplat(IE))
1456     return Splat;
1457 
1458   if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))
1459     return IdentityShuf;
1460 
1461   return nullptr;
1462 }
1463 
1464 /// Return true if we can evaluate the specified expression tree if the vector
1465 /// elements were shuffled in a different order.
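/// For example (a sketch), "add %x, %c" can be evaluated with its elements in
/// a different order because add operates lane-wise: shuffling both operands
/// and then adding produces the same lanes as adding and then shuffling.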
1466 static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,
1467                                 unsigned Depth = 5) {
1468   // We can always reorder the elements of a constant.
1469   if (isa<Constant>(V))
1470     return true;
1471 
1472   // We won't reorder vector arguments. No IPO here.
1473   Instruction *I = dyn_cast<Instruction>(V);
1474   if (!I) return false;
1475 
1476   // Two users may expect different orders of the elements. Don't try it.
1477   if (!I->hasOneUse())
1478     return false;
1479 
1480   if (Depth == 0) return false;
1481 
1482   switch (I->getOpcode()) {
1483     case Instruction::UDiv:
1484     case Instruction::SDiv:
1485     case Instruction::URem:
1486     case Instruction::SRem:
1487       // Propagating an undefined shuffle mask element to integer div/rem is not
1488       // allowed because those opcodes can create immediate undefined behavior
1489       // from an undefined element in an operand.
1490       if (llvm::is_contained(Mask, -1))
1491         return false;
1492       LLVM_FALLTHROUGH;
1493     case Instruction::Add:
1494     case Instruction::FAdd:
1495     case Instruction::Sub:
1496     case Instruction::FSub:
1497     case Instruction::Mul:
1498     case Instruction::FMul:
1499     case Instruction::FDiv:
1500     case Instruction::FRem:
1501     case Instruction::Shl:
1502     case Instruction::LShr:
1503     case Instruction::AShr:
1504     case Instruction::And:
1505     case Instruction::Or:
1506     case Instruction::Xor:
1507     case Instruction::ICmp:
1508     case Instruction::FCmp:
1509     case Instruction::Trunc:
1510     case Instruction::ZExt:
1511     case Instruction::SExt:
1512     case Instruction::FPToUI:
1513     case Instruction::FPToSI:
1514     case Instruction::UIToFP:
1515     case Instruction::SIToFP:
1516     case Instruction::FPTrunc:
1517     case Instruction::FPExt:
1518     case Instruction::GetElementPtr: {
1519       // Bail out if we would create longer vector ops. We could allow creating
1520       // longer vector ops, but that may result in more expensive codegen.
1521       Type *ITy = I->getType();
1522       if (ITy->isVectorTy() &&
1523           Mask.size() > cast<FixedVectorType>(ITy)->getNumElements())
1524         return false;
1525       for (Value *Operand : I->operands()) {
1526         if (!canEvaluateShuffled(Operand, Mask, Depth - 1))
1527           return false;
1528       }
1529       return true;
1530     }
1531     case Instruction::InsertElement: {
1532       ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
1533       if (!CI) return false;
1534       int ElementNumber = CI->getLimitedValue();
1535 
1536       // Verify that 'CI' does not occur twice in Mask. A single 'insertelement'
1537       // can't put an element into multiple indices.
1538       bool SeenOnce = false;
1539       for (int i = 0, e = Mask.size(); i != e; ++i) {
1540         if (Mask[i] == ElementNumber) {
1541           if (SeenOnce)
1542             return false;
1543           SeenOnce = true;
1544         }
1545       }
1546       return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1);
1547     }
1548   }
1549   return false;
1550 }
1551 
1552 /// Rebuild a new instruction just like 'I' but with the new operands given.
1553 /// In the event of a type mismatch, the rebuilt instruction takes its type from the new operands.
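/// For example (a sketch): rebuilding "zext <4 x i16> %x to <4 x i32>" with a
/// <2 x i16> operand produces "zext <2 x i16> %op to <2 x i32>".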
1554 static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
1555   // We don't want to use the IRBuilder here because we want the replacement
1556   // instructions to appear next to 'I', not the builder's insertion point.
1557   switch (I->getOpcode()) {
1558     case Instruction::Add:
1559     case Instruction::FAdd:
1560     case Instruction::Sub:
1561     case Instruction::FSub:
1562     case Instruction::Mul:
1563     case Instruction::FMul:
1564     case Instruction::UDiv:
1565     case Instruction::SDiv:
1566     case Instruction::FDiv:
1567     case Instruction::URem:
1568     case Instruction::SRem:
1569     case Instruction::FRem:
1570     case Instruction::Shl:
1571     case Instruction::LShr:
1572     case Instruction::AShr:
1573     case Instruction::And:
1574     case Instruction::Or:
1575     case Instruction::Xor: {
1576       BinaryOperator *BO = cast<BinaryOperator>(I);
1577       assert(NewOps.size() == 2 && "binary operator with #ops != 2");
1578       BinaryOperator *New =
1579           BinaryOperator::Create(BO->getOpcode(),
1580                                  NewOps[0], NewOps[1], "", BO);
1581       if (isa<OverflowingBinaryOperator>(BO)) {
1582         New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
1583         New->setHasNoSignedWrap(BO->hasNoSignedWrap());
1584       }
1585       if (isa<PossiblyExactOperator>(BO)) {
1586         New->setIsExact(BO->isExact());
1587       }
1588       if (isa<FPMathOperator>(BO))
1589         New->copyFastMathFlags(I);
1590       return New;
1591     }
1592     case Instruction::ICmp:
1593       assert(NewOps.size() == 2 && "icmp with #ops != 2");
1594       return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
1595                           NewOps[0], NewOps[1]);
1596     case Instruction::FCmp:
1597       assert(NewOps.size() == 2 && "fcmp with #ops != 2");
1598       return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
1599                           NewOps[0], NewOps[1]);
1600     case Instruction::Trunc:
1601     case Instruction::ZExt:
1602     case Instruction::SExt:
1603     case Instruction::FPToUI:
1604     case Instruction::FPToSI:
1605     case Instruction::UIToFP:
1606     case Instruction::SIToFP:
1607     case Instruction::FPTrunc:
1608     case Instruction::FPExt: {
1609       // It's possible that the mask has a different number of elements from
1610       // the original cast. We recompute the destination type to match the mask.
1611       Type *DestTy = VectorType::get(
1612           I->getType()->getScalarType(),
1613           cast<VectorType>(NewOps[0]->getType())->getElementCount());
1614       assert(NewOps.size() == 1 && "cast with #ops != 1");
1615       return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
1616                               "", I);
1617     }
1618     case Instruction::GetElementPtr: {
1619       Value *Ptr = NewOps[0];
1620       ArrayRef<Value*> Idx = NewOps.slice(1);
1621       GetElementPtrInst *GEP = GetElementPtrInst::Create(
1622           cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
1623       GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
1624       return GEP;
1625     }
1626   }
1627   llvm_unreachable("failed to rebuild vector instructions");
1628 }
1629 
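/// Rebuild the expression tree rooted at V with its vector elements reordered
/// according to Mask; callers are expected to have checked the tree with
/// canEvaluateShuffled first. For example (a sketch): reordering
/// "add %x, <i32 1, i32 2>" by Mask <1,0> rebuilds it as
/// "add %x', <i32 2, i32 1>", where %x' is %x evaluated in the new order.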
1630 static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
1631   // Mask.size() does not need to be equal to the number of vector elements.
1632 
1633   assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
1634   Type *EltTy = V->getType()->getScalarType();
1635   Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
1636   if (isa<UndefValue>(V))
1637     return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
1638 
1639   if (isa<ConstantAggregateZero>(V))
1640     return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
1641 
1642   if (Constant *C = dyn_cast<Constant>(V))
1643     return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
1644                                           Mask);
1645 
1646   Instruction *I = cast<Instruction>(V);
1647   switch (I->getOpcode()) {
1648     case Instruction::Add:
1649     case Instruction::FAdd:
1650     case Instruction::Sub:
1651     case Instruction::FSub:
1652     case Instruction::Mul:
1653     case Instruction::FMul:
1654     case Instruction::UDiv:
1655     case Instruction::SDiv:
1656     case Instruction::FDiv:
1657     case Instruction::URem:
1658     case Instruction::SRem:
1659     case Instruction::FRem:
1660     case Instruction::Shl:
1661     case Instruction::LShr:
1662     case Instruction::AShr:
1663     case Instruction::And:
1664     case Instruction::Or:
1665     case Instruction::Xor:
1666     case Instruction::ICmp:
1667     case Instruction::FCmp:
1668     case Instruction::Trunc:
1669     case Instruction::ZExt:
1670     case Instruction::SExt:
1671     case Instruction::FPToUI:
1672     case Instruction::FPToSI:
1673     case Instruction::UIToFP:
1674     case Instruction::SIToFP:
1675     case Instruction::FPTrunc:
1676     case Instruction::FPExt:
1677     case Instruction::Select:
1678     case Instruction::GetElementPtr: {
1679       SmallVector<Value*, 8> NewOps;
1680       bool NeedsRebuild =
1681           (Mask.size() !=
1682            cast<FixedVectorType>(I->getType())->getNumElements());
1683       for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
1684         Value *V;
1685         // Recursively call evaluateInDifferentElementOrder on vector arguments
1686         // as well. E.g. GetElementPtr may have scalar operands even if the
1687         // return value is a vector, so we need to examine the operand type.
1688         if (I->getOperand(i)->getType()->isVectorTy())
1689           V = evaluateInDifferentElementOrder(I->getOperand(i), Mask);
1690         else
1691           V = I->getOperand(i);
1692         NewOps.push_back(V);
1693         NeedsRebuild |= (V != I->getOperand(i));
1694       }
1695       if (NeedsRebuild) {
1696         return buildNew(I, NewOps);
1697       }
1698       return I;
1699     }
1700     case Instruction::InsertElement: {
1701       int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
1702 
1703       // The insertelement was inserting at Element. Figure out which element
1704       // that becomes after shuffling. The answer is guaranteed to be unique
1705       // by canEvaluateShuffled.
1706       bool Found = false;
1707       int Index = 0;
1708       for (int e = Mask.size(); Index != e; ++Index) {
1709         if (Mask[Index] == Element) {
1710           Found = true;
1711           break;
1712         }
1713       }
1714 
1715       // If the element is not in Mask, there is no need to handle operand 1
1716       // (the element to be inserted); just evaluate operand 0 according to Mask.
1717       if (!Found)
1718         return evaluateInDifferentElementOrder(I->getOperand(0), Mask);
1719 
1720       Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask);
1721       return InsertElementInst::Create(V, I->getOperand(1),
1722                                        ConstantInt::get(I32Ty, Index), "", I);
1723     }
1724   }
1725   llvm_unreachable("failed to reorder elements of vector instruction!");
1726 }
1727 
1728 // Returns true if the shuffle is extracting a contiguous range of values from
1729 // LHS, for example:
1730 //                 +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
1731 //   Input:        |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
1732 //   Shuffles to:  |EE|FF|GG|HH|
1733 //                 +--+--+--+--+
1734 static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
1735                                        ArrayRef<int> Mask) {
1736   unsigned LHSElems =
1737       cast<FixedVectorType>(SVI.getOperand(0)->getType())->getNumElements();
1738   unsigned MaskElems = Mask.size();
1739   unsigned BegIdx = Mask.front();
1740   unsigned EndIdx = Mask.back();
1741   if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
1742     return false;
1743   for (unsigned I = 0; I != MaskElems; ++I)
1744     if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
1745       return false;
1746   return true;
1747 }
1748 
1749 /// These are the ingredients in an alternate form binary operator as described
1750 /// below.
1751 struct BinopElts {
1752   BinaryOperator::BinaryOps Opcode;
1753   Value *Op0;
1754   Value *Op1;
1755   BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
1756             Value *V0 = nullptr, Value *V1 = nullptr) :
1757       Opcode(Opc), Op0(V0), Op1(V1) {}
1758   operator bool() const { return Opcode != 0; }
1759 };
1760 
1761 /// Binops may be transformed into binops with different opcodes and operands.
1762 /// Reverse the usual canonicalization to enable folds with the non-canonical
1763 /// form of the binop. If a transform is possible, return the elements of the
1764 /// new binop. If not, return invalid elements.
1765 static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
1766   Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
1767   Type *Ty = BO->getType();
1768   switch (BO->getOpcode()) {
1769     case Instruction::Shl: {
1770       // shl X, C --> mul X, (1 << C)
1771       Constant *C;
1772       if (match(BO1, m_Constant(C))) {
1773         Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
1774         return { Instruction::Mul, BO0, ShlOne };
1775       }
1776       break;
1777     }
1778     case Instruction::Or: {
1779       // or X, C --> add X, C (when X and C have no common bits set)
1780       const APInt *C;
1781       if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
1782         return { Instruction::Add, BO0, BO1 };
1783       break;
1784     }
1785     default:
1786       break;
1787   }
1788   return {};
1789 }
1790 
1791 static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
1792   assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
1793 
1794   // Are we shuffling together some value and that same value after it has been
1795   // modified by a binop with a constant?
1796   Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
1797   Constant *C;
1798   bool Op0IsBinop;
1799   if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
1800     Op0IsBinop = true;
1801   else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
1802     Op0IsBinop = false;
1803   else
1804     return nullptr;
1805 
1806   // The identity constant for a binop leaves a variable operand unchanged. For
1807   // a vector, this is a splat of something like 0, -1, or 1.
1808   // If there's no identity constant for this binop, we're done.
1809   auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
1810   BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
1811   Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
1812   if (!IdC)
1813     return nullptr;
1814 
1815   // Shuffle identity constants into the lanes that return the original value.
1816   // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
1817   // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
1818   // The existing binop constant vector remains in the same operand position.
1819   ArrayRef<int> Mask = Shuf.getShuffleMask();
1820   Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
1821                                 ConstantExpr::getShuffleVector(IdC, C, Mask);
1822 
1823   bool MightCreatePoisonOrUB =
1824       is_contained(Mask, UndefMaskElem) &&
1825       (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
1826   if (MightCreatePoisonOrUB)
1827     NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true);
1828 
1829   // shuf (bop X, C), X, M --> bop X, C'
1830   // shuf X, (bop X, C), M --> bop X, C'
1831   Value *X = Op0IsBinop ? Op1 : Op0;
1832   Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
1833   NewBO->copyIRFlags(BO);
1834 
1835   // An undef shuffle mask element may propagate as an undef constant element in
1836   // the new binop. That would produce poison where the original code might not.
1837   // If we already made a safe constant, then there's no danger.
1838   if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
1839     NewBO->dropPoisonGeneratingFlags();
1840   return NewBO;
1841 }
1842 
1843 /// If we have an insert of a scalar to a non-zero element of an undefined
1844 /// vector and then shuffle that value, that's the same as inserting to the zero
1845 /// element and shuffling. Splatting from the zero element is recognized as the
1846 /// canonical form of splat.
1847 static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
1848                                             InstCombiner::BuilderTy &Builder) {
1849   Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
1850   ArrayRef<int> Mask = Shuf.getShuffleMask();
1851   Value *X;
1852   uint64_t IndexC;
1853 
1854   // Match a shuffle that is a splat to a non-zero element.
1855   if (!match(Op0, m_OneUse(m_InsertElt(m_Undef(), m_Value(X),
1856                                        m_ConstantInt(IndexC)))) ||
1857       !match(Op1, m_Undef()) || match(Mask, m_ZeroMask()) || IndexC == 0)
1858     return nullptr;
1859 
1860   // Insert into element 0 of an undef vector.
1861   UndefValue *UndefVec = UndefValue::get(Shuf.getType());
1862   Constant *Zero = Builder.getInt32(0);
1863   Value *NewIns = Builder.CreateInsertElement(UndefVec, X, Zero);
1864 
1865   // Splat from element 0. Any mask element that is undefined remains undefined.
1866   // For example:
1867   // shuf (inselt undef, X, 2), undef, <2,2,undef>
1868   //   --> shuf (inselt undef, X, 0), undef, <0,0,undef>
1869   unsigned NumMaskElts =
1870       cast<FixedVectorType>(Shuf.getType())->getNumElements();
1871   SmallVector<int, 16> NewMask(NumMaskElts, 0);
1872   for (unsigned i = 0; i != NumMaskElts; ++i)
1873     if (Mask[i] == UndefMaskElem)
1874       NewMask[i] = Mask[i];
1875 
1876   return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
1877 }
1878 
1879 /// Try to fold shuffles that are the equivalent of a vector select.
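/// For example (a sketch):
///   shuf (add X, <1,2,3,4>), (add X, <5,6,7,8>), <0,5,2,7>
///     --> add X, <1,6,3,8>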
1880 static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
1881                                       InstCombiner::BuilderTy &Builder,
1882                                       const DataLayout &DL) {
1883   if (!Shuf.isSelect())
1884     return nullptr;
1885 
1886   // Canonicalize to choose from operand 0 first unless operand 1 is undefined.
1887   // Commuting undef to operand 0 conflicts with another canonicalization.
1888   unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
1889   if (!isa<UndefValue>(Shuf.getOperand(1)) &&
1890       Shuf.getMaskValue(0) >= (int)NumElts) {
1891     // TODO: Can we assert that both operands of a shuffle-select are not undef
1892     // (otherwise, it would have been folded by instsimplify)?
1893     Shuf.commute();
1894     return &Shuf;
1895   }
1896 
1897   if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
1898     return I;
1899 
1900   BinaryOperator *B0, *B1;
1901   if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
1902       !match(Shuf.getOperand(1), m_BinOp(B1)))
1903     return nullptr;
1904 
1905   Value *X, *Y;
1906   Constant *C0, *C1;
1907   bool ConstantsAreOp1;
1908   if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
1909       match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
1910     ConstantsAreOp1 = true;
1911   else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
1912            match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
1913     ConstantsAreOp1 = false;
1914   else
1915     return nullptr;
1916 
1917   // We need matching binops to fold the lanes together.
1918   BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
1919   BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
1920   bool DropNSW = false;
1921   if (ConstantsAreOp1 && Opc0 != Opc1) {
1922     // TODO: We drop "nsw" if shift is converted into multiply because it may
1923     // not be correct when the shift amount is BitWidth - 1. We could examine
1924     // each vector element to determine if it is safe to keep that flag.
1925     if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
1926       DropNSW = true;
1927     if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
1928       assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
1929       Opc0 = AltB0.Opcode;
1930       C0 = cast<Constant>(AltB0.Op1);
1931     } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
1932       assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
1933       Opc1 = AltB1.Opcode;
1934       C1 = cast<Constant>(AltB1.Op1);
1935     }
1936   }
1937 
1938   if (Opc0 != Opc1)
1939     return nullptr;
1940 
1941   // The opcodes must be the same. Use a new name to make that clear.
1942   BinaryOperator::BinaryOps BOpc = Opc0;
1943 
1944   // Select the constant elements needed for the single binop.
1945   ArrayRef<int> Mask = Shuf.getShuffleMask();
1946   Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
1947 
1948   // We are moving a binop after a shuffle. When a shuffle has an undefined
1949   // mask element, the result is undefined, but it is not poison or undefined
1950   // behavior. That is not necessarily true for div/rem/shift.
1951   bool MightCreatePoisonOrUB =
1952       is_contained(Mask, UndefMaskElem) &&
1953       (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
1954   if (MightCreatePoisonOrUB)
1955     NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC,
1956                                                        ConstantsAreOp1);
1957 
1958   Value *V;
1959   if (X == Y) {
1960     // Remove a binop and the shuffle by rearranging the constant:
1961     // shuffle (op V, C0), (op V, C1), M --> op V, C'
1962     // shuffle (op C0, V), (op C1, V), M --> op C', V
1963     V = X;
1964   } else {
1965     // If there are 2 different variable operands, we must create a new shuffle
1966     // (select) first, so check uses to ensure that we don't end up with more
1967     // instructions than we started with.
1968     if (!B0->hasOneUse() && !B1->hasOneUse())
1969       return nullptr;
1970 
1971     // If we use the original shuffle mask and op1 is *variable*, we would be
1972     // putting an undef into operand 1 of div/rem/shift. This is either UB or
1973     // poison. We do not have to guard against UB when *constants* are op1
1974     // because safe constants guarantee that we do not overflow sdiv/srem (and
1975     // there's no danger for other opcodes).
1976     // TODO: To allow this case, create a new shuffle mask with no undefs.
1977     if (MightCreatePoisonOrUB && !ConstantsAreOp1)
1978       return nullptr;
1979 
1980     // Note: In general, we do not create new shuffles in InstCombine because we
1981     // do not know if a target can lower an arbitrary shuffle optimally. In this
1982     // case, the shuffle uses the existing mask, so there is no additional risk.
1983 
1984     // Select the variable vectors first, then perform the binop:
1985     // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
1986     // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
1987     V = Builder.CreateShuffleVector(X, Y, Mask);
1988   }
1989 
1990   Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
1991                                          BinaryOperator::Create(BOpc, NewC, V);
1992 
1993   // Flags are intersected from the 2 source binops. But there are 2 exceptions:
1994   // 1. If we changed an opcode, poison conditions might have changed.
1995   // 2. If the shuffle had undef mask elements, the new binop might have undefs
1996   //    where the original code did not. But if we already made a safe constant,
1997   //    then there's no danger.
1998   NewBO->copyIRFlags(B0);
1999   NewBO->andIRFlags(B1);
2000   if (DropNSW)
2001     NewBO->setHasNoSignedWrap(false);
2002   if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
2003     NewBO->dropPoisonGeneratingFlags();
2004   return NewBO;
2005 }
2006 
2007 /// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
2008 /// Example (little endian):
2009 /// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
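/// Example (big endian):
/// shuf (bitcast <4 x i16> X to <8 x i8>), <1, 3, 5, 7> --> trunc X to <4 x i8>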
2010 static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
2011                                      bool IsBigEndian) {
2012   // This must be a bitcasted shuffle of 1 vector integer operand.
2013   Type *DestType = Shuf.getType();
2014   Value *X;
2015   if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
2016       !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
2017     return nullptr;
2018 
2019   // The source type must have the same number of elements as the shuffle,
2020   // and the source element type must be larger than the shuffle element type.
2021   Type *SrcType = X->getType();
2022   if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
2023       cast<FixedVectorType>(SrcType)->getNumElements() !=
2024           cast<FixedVectorType>(DestType)->getNumElements() ||
2025       SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
2026     return nullptr;
2027 
2028   assert(Shuf.changesLength() && !Shuf.increasesLength() &&
2029          "Expected a shuffle that decreases length");
2030 
2031   // Last, check that the mask chooses the correct low bits for each narrow
2032   // element in the result.
2033   uint64_t TruncRatio =
2034       SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
2035   ArrayRef<int> Mask = Shuf.getShuffleMask();
2036   for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
2037     if (Mask[i] == UndefMaskElem)
2038       continue;
2039     uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
2040     assert(LSBIndex <= INT32_MAX && "Overflowed 32-bits");
2041     if (Mask[i] != (int)LSBIndex)
2042       return nullptr;
2043   }
2044 
2045   return new TruncInst(X, DestType);
2046 }
2047 
2048 /// Match a shuffle-select-shuffle pattern where the shuffles are widening and
2049 /// narrowing (concatenating with undef and extracting back to the original
2050 /// length). This allows replacing the wide select with a narrow select.
2051 static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
2052                                        InstCombiner::BuilderTy &Builder) {
2053   // This must be a narrowing identity shuffle. It extracts the 1st N elements
2054   // of the 1st vector operand of a shuffle.
2055   if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract())
2056     return nullptr;
2057 
2058   // The vector being shuffled must be a vector select that we can eliminate.
2059   // TODO: The one-use requirement could be eased if X and/or Y are constants.
2060   Value *Cond, *X, *Y;
2061   if (!match(Shuf.getOperand(0),
2062              m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y)))))
2063     return nullptr;
2064 
2065   // We need a narrow condition value. It must be extended with undef elements
2066   // and have the same number of elements as this shuffle.
2067   unsigned NarrowNumElts =
2068       cast<FixedVectorType>(Shuf.getType())->getNumElements();
2069   Value *NarrowCond;
2070   if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Undef()))) ||
2071       cast<FixedVectorType>(NarrowCond->getType())->getNumElements() !=
2072           NarrowNumElts ||
2073       !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())
2074     return nullptr;
2075 
2076   // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask) -->
2077   // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask)
2078   Value *NarrowX = Builder.CreateShuffleVector(X, Shuf.getShuffleMask());
2079   Value *NarrowY = Builder.CreateShuffleVector(Y, Shuf.getShuffleMask());
2080   return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
2081 }
2082 
2083 /// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
2084 static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
2085   Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
2086   if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
2087     return nullptr;
2088 
2089   Value *X, *Y;
2090   ArrayRef<int> Mask;
2091   if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
2092     return nullptr;
2093 
2094   // Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
2095   // then combining may result in worse codegen.
2096   if (!Op0->hasOneUse())
2097     return nullptr;
2098 
2099   // We are extracting a subvector from a shuffle. Remove excess elements from
2100   // the 1st shuffle mask to eliminate the extract.
2101   //
2102   // This transform is conservatively limited to identity extracts because we do
2103   // not allow arbitrary shuffle mask creation as a target-independent transform
2104   // (because we can't guarantee that will lower efficiently).
2105   //
2106   // If the extracting shuffle has an undef mask element, it transfers to the
2107   // new shuffle mask. Otherwise, copy the original mask element. Example:
2108   //   shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
2109   //   shuf X, Y, <C0, undef, C2, undef>
2110   unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
2111   SmallVector<int, 16> NewMask(NumElts);
2112   assert(NumElts < Mask.size() &&
2113          "Identity with extract must have less elements than its inputs");
2114 
2115   for (unsigned i = 0; i != NumElts; ++i) {
2116     int ExtractMaskElt = Shuf.getMaskValue(i);
2117     int MaskElt = Mask[i];
2118     NewMask[i] = ExtractMaskElt == UndefMaskElem ? ExtractMaskElt : MaskElt;
2119   }
2120   return new ShuffleVectorInst(X, Y, NewMask);
2121 }
2122 
2123 /// Try to replace a shuffle with an insertelement or try to replace a shuffle
2124 /// operand with the operand of an insertelement.
2125 static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
2126                                           InstCombinerImpl &IC) {
2127   Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
2128   SmallVector<int, 16> Mask;
2129   Shuf.getShuffleMask(Mask);
2130 
2131   // The shuffle must not change vector sizes.
2132   // TODO: This restriction could be removed if the insert has only one use
2133   //       (because the transform would require a new length-changing shuffle).
2134   int NumElts = Mask.size();
2135   if (NumElts != (int)(cast<FixedVectorType>(V0->getType())->getNumElements()))
2136     return nullptr;
2137 
2138   // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
2139   // not be able to handle it there if the insertelement has >1 use.
2140   // If the shuffle has an insertelement operand but does not choose the
2141   // inserted scalar element from that value, then we can replace that shuffle
2142   // operand with the source vector of the insertelement.
2143   Value *X;
2144   uint64_t IdxC;
2145   if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
2146     // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
2147     if (!is_contained(Mask, (int)IdxC))
2148       return IC.replaceOperand(Shuf, 0, X);
2149   }
2150   if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
2151     // Offset the index constant by the vector width because we are checking for
2152     // accesses to the 2nd vector input of the shuffle.
2153     IdxC += NumElts;
2154     // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
2155     if (!is_contained(Mask, (int)IdxC))
2156       return IC.replaceOperand(Shuf, 1, X);
2157   }
2158 
2159   // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
2160   auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
2161     // We need an insertelement with a constant index.
2162     if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),
2163                                m_ConstantInt(IndexC))))
2164       return false;
2165 
2166     // Test the shuffle mask to see if it splices the inserted scalar into the
2167     // operand 1 vector of the shuffle.
2168     int NewInsIndex = -1;
2169     for (int i = 0; i != NumElts; ++i) {
2170       // Ignore undef mask elements.
2171       if (Mask[i] == -1)
2172         continue;
2173 
2174       // The shuffle takes elements of operand 1 without lane changes.
2175       if (Mask[i] == NumElts + i)
2176         continue;
2177 
2178       // The shuffle must choose the inserted scalar exactly once.
2179       if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
2180         return false;
2181 
2182       // The shuffle is placing the inserted scalar into element i.
2183       NewInsIndex = i;
2184     }
2185 
2186     assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
2187 
2188     // Index is updated to the potentially translated insertion lane.
2189     IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
2190     return true;
2191   };
2192 
2193   // If the shuffle is unnecessary, insert the scalar operand directly into
2194   // operand 1 of the shuffle. Example:
2195   // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
2196   Value *Scalar;
2197   ConstantInt *IndexC;
2198   if (isShufflingScalarIntoOp1(Scalar, IndexC))
2199     return InsertElementInst::Create(V1, Scalar, IndexC);
2200 
2201   // Try again after commuting shuffle. Example:
2202   // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
2203   // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
2204   std::swap(V0, V1);
2205   ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
2206   if (isShufflingScalarIntoOp1(Scalar, IndexC))
2207     return InsertElementInst::Create(V1, Scalar, IndexC);
2208 
2209   return nullptr;
2210 }
2211 
2212 static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
2213   // Match the operands as identity with padding (also known as concatenation
2214   // with undef) shuffles of the same source type. The backend is expected to
2215   // recreate these concatenations from a shuffle of narrow operands.
2216   auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));
2217   auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));
2218   if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||
2219       !Shuffle1 || !Shuffle1->isIdentityWithPadding())
2220     return nullptr;
2221 
2222   // We limit this transform to power-of-2 types because we expect that the
2223   // backend can convert the simplified IR patterns to identical nodes as the
2224   // original IR.
2225   // TODO: If we can verify the same behavior for arbitrary types, the
2226   //       power-of-2 checks can be removed.
2227   Value *X = Shuffle0->getOperand(0);
2228   Value *Y = Shuffle1->getOperand(0);
2229   if (X->getType() != Y->getType() ||
2230       !isPowerOf2_32(cast<FixedVectorType>(Shuf.getType())->getNumElements()) ||
2231       !isPowerOf2_32(
2232           cast<FixedVectorType>(Shuffle0->getType())->getNumElements()) ||
2233       !isPowerOf2_32(cast<FixedVectorType>(X->getType())->getNumElements()) ||
2234       isa<UndefValue>(X) || isa<UndefValue>(Y))
2235     return nullptr;
2236   assert(isa<UndefValue>(Shuffle0->getOperand(1)) &&
2237          isa<UndefValue>(Shuffle1->getOperand(1)) &&
2238          "Unexpected operand for identity shuffle");
2239 
2240   // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source
2241   // operands directly by adjusting the shuffle mask to account for the narrower
2242   // types:
2243   // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
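  // For example (a sketch with <2 x i32> sources widened to <4 x i32>):
  //   shuf (shuf X, undef, <0,1,undef,undef>),
  //        (shuf Y, undef, <0,1,undef,undef>), <0,4,1,5>
  //     --> shuf X, Y, <0,2,1,3>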
2244   int NarrowElts = cast<FixedVectorType>(X->getType())->getNumElements();
2245   int WideElts = cast<FixedVectorType>(Shuffle0->getType())->getNumElements();
2246   assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
2247 
2248   ArrayRef<int> Mask = Shuf.getShuffleMask();
2249   SmallVector<int, 16> NewMask(Mask.size(), -1);
2250   for (int i = 0, e = Mask.size(); i != e; ++i) {
2251     if (Mask[i] == -1)
2252       continue;
2253 
2254     // If this shuffle is choosing an undef element from 1 of the sources, that
2255     // element is undef.
2256     if (Mask[i] < WideElts) {
2257       if (Shuffle0->getMaskValue(Mask[i]) == -1)
2258         continue;
2259     } else {
2260       if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1)
2261         continue;
2262     }
2263 
2264     // If this shuffle is choosing from the 1st narrow op, the mask element is
2265     // the same. If this shuffle is choosing from the 2nd narrow op, the mask
2266     // element is offset down to adjust for the narrow vector widths.
2267     if (Mask[i] < WideElts) {
2268       assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");
2269       NewMask[i] = Mask[i];
2270     } else {
2271       assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");
2272       NewMask[i] = Mask[i] - (WideElts - NarrowElts);
2273     }
2274   }
2275   return new ShuffleVectorInst(X, Y, NewMask);
2276 }
2277 
2278 Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
2279   Value *LHS = SVI.getOperand(0);
2280   Value *RHS = SVI.getOperand(1);
2281   SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
2282   if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
2283                                           SVI.getType(), ShufQuery))
2284     return replaceInstUsesWith(SVI, V);
2285 
2286   // Bail out for scalable vectors
2287   if (isa<ScalableVectorType>(LHS->getType()))
2288     return nullptr;
2289 
2290   // shuffle x, x, mask --> shuffle x, undef, mask'
2291   unsigned VWidth = cast<FixedVectorType>(SVI.getType())->getNumElements();
2292   unsigned LHSWidth = cast<FixedVectorType>(LHS->getType())->getNumElements();
2293   ArrayRef<int> Mask = SVI.getShuffleMask();
2294   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
2295 
2296   // Peek through a bitcasted shuffle operand by scaling the mask. If the
2297   // simulated shuffle can simplify, then this shuffle is unnecessary:
2298   // shuf (bitcast X), undef, Mask --> bitcast X'
2299   // TODO: This could be extended to allow length-changing shuffles.
2300   //       The transform might also be obsoleted if we allowed canonicalization
2301   //       of bitcasted shuffles.
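  // For example (a sketch): in
  //   shuf (bitcast <2 x i64> X to <4 x i32>), undef, <0,1,2,3>
  // the mask widens to <0,1>, the simulated shuffle simplifies to X, and the
  // whole shuffle reduces to "bitcast <2 x i64> X to <4 x i32>".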
2302   Value *X;
2303   if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
2304       X->getType()->isVectorTy() && VWidth == LHSWidth) {
2305     // Try to create a scaled mask constant.
2306     auto *XType = cast<FixedVectorType>(X->getType());
2307     unsigned XNumElts = XType->getNumElements();
2308     SmallVector<int, 16> ScaledMask;
2309     if (XNumElts >= VWidth) {
2310       assert(XNumElts % VWidth == 0 && "Unexpected vector bitcast");
2311       narrowShuffleMaskElts(XNumElts / VWidth, Mask, ScaledMask);
2312     } else {
2313       assert(VWidth % XNumElts == 0 && "Unexpected vector bitcast");
2314       if (!widenShuffleMaskElts(VWidth / XNumElts, Mask, ScaledMask))
2315         ScaledMask.clear();
2316     }
2317     if (!ScaledMask.empty()) {
2318       // If the shuffled source vector simplifies, cast that value to this
2319       // shuffle's type.
2320       if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
2321                                               ScaledMask, XType, ShufQuery))
2322         return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
2323     }
2324   }
2325 
2326   if (LHS == RHS) {
2327     assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?");
2328     // Remap any references to RHS to use LHS.
2329     SmallVector<int, 16> Elts;
2330     for (unsigned i = 0; i != VWidth; ++i) {
2331       // Propagate undef elements or force mask to LHS.
2332       if (Mask[i] < 0)
2333         Elts.push_back(UndefMaskElem);
2334       else
2335         Elts.push_back(Mask[i] % LHSWidth);
2336     }
2337     return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
2338   }
2339 
2340   // shuffle undef, x, mask --> shuffle x, undef, mask'
2341   if (isa<UndefValue>(LHS)) {
2342     SVI.commute();
2343     return &SVI;
2344   }
2345 
2346   if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))
2347     return I;
2348 
2349   if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
2350     return I;
2351 
2352   if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
2353     return I;
2354 
2355   if (Instruction *I = narrowVectorSelect(SVI, Builder))
2356     return I;
2357 
2358   APInt UndefElts(VWidth, 0);
2359   APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
2360   if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
2361     if (V != &SVI)
2362       return replaceInstUsesWith(SVI, V);
2363     return &SVI;
2364   }
2365 
2366   if (Instruction *I = foldIdentityExtractShuffle(SVI))
2367     return I;
2368 
2369   // These transforms have the potential to lose undef knowledge, so they are
2370   // intentionally placed after SimplifyDemandedVectorElts().
2371   if (Instruction *I = foldShuffleWithInsert(SVI, *this))
2372     return I;
2373   if (Instruction *I = foldIdentityPaddedShuffles(SVI))
2374     return I;
2375 
2376   if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) {
2377     Value *V = evaluateInDifferentElementOrder(LHS, Mask);
2378     return replaceInstUsesWith(SVI, V);
2379   }
2380 
2381   // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
2382   // a non-vector type. We can instead bitcast the original vector followed by
2383   // an extract of the desired element:
2384   //
2385   //   %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
2386   //                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2387   //   %1 = bitcast <4 x i8> %sroa to i32
2388   // Becomes:
2389   //   %bc = bitcast <16 x i8> %in to <4 x i32>
2390   //   %ext = extractelement <4 x i32> %bc, i32 0
2391   //
2392   // If the shuffle is extracting a contiguous range of values from the input
2393   // vector then each use which is a bitcast of the extracted size can be
2394   // replaced. This will work if the vector types are compatible, and the begin
2395   // index is aligned to a value in the casted vector type. If the begin index
2396   // isn't aligned then we can shuffle the original vector (keeping the same
2397   // vector type) before extracting.
2398   //
2399   // This code will bail out if the target type is fundamentally incompatible
2400   // with vectors of the source type.
2401   //
2402   // Example of <16 x i8>, target type i32:
2403   // Index range [4,8):         v-----------v Will work.
2404   //                +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
2405   //     <16 x i8>: |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
2406   //     <4 x i32>: |           |           |           |           |
2407   //                +-----------+-----------+-----------+-----------+
2408   // Index range [6,10):              ^-----------^ Needs an extra shuffle.
2409   // Target type i40:           ^--------------^ Won't work, bail.
2410   bool MadeChange = false;
2411   if (isShuffleExtractingFromLHS(SVI, Mask)) {
2412     Value *V = LHS;
2413     unsigned MaskElems = Mask.size();
2414     auto *SrcTy = cast<FixedVectorType>(V->getType());
2415     unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedSize();
2416     unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
2417     assert(SrcElemBitWidth && "vector elements must have a bitwidth");
2418     unsigned SrcNumElems = SrcTy->getNumElements();
2419     SmallVector<BitCastInst *, 8> BCs;
2420     DenseMap<Type *, Value *> NewBCs;
2421     for (User *U : SVI.users())
2422       if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
2423         if (!BC->use_empty())
2424           // Only visit bitcasts that weren't previously handled.
2425           BCs.push_back(BC);
2426     for (BitCastInst *BC : BCs) {
2427       unsigned BegIdx = Mask.front();
2428       Type *TgtTy = BC->getDestTy();
2429       unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
2430       if (!TgtElemBitWidth)
2431         continue;
2432       unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
2433       bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
      bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
      if (!VecBitWidthsEqual)
        continue;
      if (!VectorType::isValidElementType(TgtTy))
        continue;
      auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
      if (!BegIsAligned) {
        // Shuffle the input so [0,MaskElems) contains the output, and
        // [MaskElems,SrcNumElems) is undef.
        SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);
        for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
          ShuffleMask[I] = Idx;
        V = Builder.CreateShuffleVector(V, ShuffleMask,
                                        SVI.getName() + ".extract");
        BegIdx = 0;
      }
      unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
      assert(SrcElemsPerTgtElem);
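      // BegIdx is an index into the source vector's elements; convert it to
      // an element index in the casted vector, e.g. for <16 x i8> ->
      // <4 x i32> a byte index of 4 becomes i32 element 1.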
      BegIdx /= SrcElemsPerTgtElem;
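      // If an earlier user already required a bitcast to this vector type,
      // reuse it rather than creating a duplicate cast.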
      bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
      auto *NewBC =
          BCAlreadyExists
              ? NewBCs[CastSrcTy]
              : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
      if (!BCAlreadyExists)
        NewBCs[CastSrcTy] = NewBC;
      auto *Ext = Builder.CreateExtractElement(
          NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
      // The shufflevector isn't being replaced: the bitcast that used it
      // is. InstCombine will visit the newly-created instructions.
      replaceInstUsesWith(*BC, Ext);
      MadeChange = true;
    }
  }

  // If the LHS is a shufflevector itself, see if we can combine it with this
  // one without producing an unusual shuffle.
  // Cases that might be simplified:
  // 1.
  // x1=shuffle(v1,v2,mask1)
  //  x=shuffle(x1,undef,mask)
  //        ==>
  //  x=shuffle(v1,undef,newMask)
  // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
  // 2.
  // x1=shuffle(v1,undef,mask1)
  //  x=shuffle(x1,x2,mask)
  // where v1.size() == mask1.size()
  //        ==>
  //  x=shuffle(v1,x2,newMask)
  // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
  // 3.
  // x2=shuffle(v2,undef,mask2)
  //  x=shuffle(x1,x2,mask)
  // where v2.size() == mask2.size()
  //        ==>
  //  x=shuffle(x1,v2,newMask)
  // newMask[i] = (mask[i] < x1.size())
  //              ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
  // 4.
  // x1=shuffle(v1,undef,mask1)
  // x2=shuffle(v2,undef,mask2)
  //  x=shuffle(x1,x2,mask)
  // where v1.size() == v2.size()
  //        ==>
  //  x=shuffle(v1,v2,newMask)
  // newMask[i] = (mask[i] < x1.size())
  //              ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
  //
  // Here we are deliberately conservative: we do not want to introduce a
  // shuffle mask that did not appear in the input program, because codegen
  // may not be smart enough to split a merged shuffle back into two cheap
  // shuffles and may produce worse code. As such, we only merge two shuffles
  // if the result is either a splat or one of the input shuffle masks. In
  // that case, merging the shuffles just removes one instruction, which we
  // know is safe. This is good for things like
  // turning: (splat(splat)) -> splat, or
  // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
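  //
  // For example, a splat of a splat (a sketch; value names are illustrative):
  //   %x1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> zeroinitializer
  //   %x  = shufflevector <4 x i32> %x1, <4 x i32> undef, <4 x i32> zeroinitializer
  // merges to a single splat of the original vector (case 1):
  //   %x  = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> zeroinitializer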
  ShuffleVectorInst *LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
  ShuffleVectorInst *RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
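  // Per the cases above, LHSShuffle is usable only if its second operand is
  // undef or this shuffle's RHS is undef (cases 1, 2 and 4); RHSShuffle is
  // usable only if its second operand is undef (cases 3 and 4).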
  if (LHSShuffle)
    if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
      LHSShuffle = nullptr;
  if (RHSShuffle)
    if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
      RHSShuffle = nullptr;
  if (!LHSShuffle && !RHSShuffle)
    return MadeChange ? &SVI : nullptr;

  Value *LHSOp0 = nullptr;
  Value *LHSOp1 = nullptr;
  Value *RHSOp0 = nullptr;
  unsigned LHSOp0Width = 0;
  unsigned RHSOp0Width = 0;
  if (LHSShuffle) {
    LHSOp0 = LHSShuffle->getOperand(0);
    LHSOp1 = LHSShuffle->getOperand(1);
    LHSOp0Width = cast<FixedVectorType>(LHSOp0->getType())->getNumElements();
  }
  if (RHSShuffle) {
    RHSOp0 = RHSShuffle->getOperand(0);
    RHSOp0Width = cast<FixedVectorType>(RHSOp0->getType())->getNumElements();
  }
  Value *newLHS = LHS;
  Value *newRHS = RHS;
  if (LHSShuffle) {
    // case 1
    if (isa<UndefValue>(RHS)) {
      newLHS = LHSOp0;
      newRHS = LHSOp1;
    }
    // case 2 or 4
    else if (LHSOp0Width == LHSWidth) {
      newLHS = LHSOp0;
    }
  }
  // case 3 or 4
  if (RHSShuffle && RHSOp0Width == LHSWidth) {
    newRHS = RHSOp0;
  }
  // case 4: both inputs shuffle the same source vector, so fold to a
  // single-source shuffle of that vector.
  if (LHSOp0 == RHSOp0) {
    newLHS = LHSOp0;
    newRHS = nullptr;
  }

  if (newLHS == LHS && newRHS == RHS)
    return MadeChange ? &SVI : nullptr;

  ArrayRef<int> LHSMask;
  ArrayRef<int> RHSMask;
  if (newLHS != LHS)
    LHSMask = LHSShuffle->getShuffleMask();
  if (RHSShuffle && newRHS != RHS)
    RHSMask = RHSShuffle->getShuffleMask();

  unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
  SmallVector<int, 16> newMask;
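  // Track whether every defined element of the new mask selects the same
  // source element; if so, the merged shuffle is a splat.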
  bool isSplat = true;
  int SplatElt = -1;
  // Create a new mask for the new ShuffleVectorInst so that the new
  // ShuffleVectorInst is equivalent to the original one.
  for (unsigned i = 0; i < VWidth; ++i) {
    int eltMask;
    if (Mask[i] < 0) {
      // This element is an undef value.
      eltMask = -1;
    } else if (Mask[i] < (int)LHSWidth) {
      // This element is from the left hand side vector operand.
      //
      // If LHS is going to be replaced (case 1, 2, or 4), calculate the
      // new mask value for the element.
      if (newLHS != LHS) {
        eltMask = LHSMask[Mask[i]];
        // If the value selected is an undef value, explicitly specify it
        // with a -1 mask value.
        if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
          eltMask = -1;
      } else
        eltMask = Mask[i];
    } else {
      // This element is from the right hand side vector operand.
      //
      // If the value selected is an undef value, explicitly specify it
      // with a -1 mask value. (case 1)
      if (isa<UndefValue>(RHS))
        eltMask = -1;
      // If RHS is going to be replaced (case 3 or 4), calculate the
      // new mask value for the element.
      else if (newRHS != RHS) {
        eltMask = RHSMask[Mask[i] - LHSWidth];
        // If the value selected is an undef value, explicitly specify it
        // with a -1 mask value.
        if (eltMask >= (int)RHSOp0Width) {
          assert(isa<UndefValue>(RHSShuffle->getOperand(1)) &&
                 "should have been checked above");
          eltMask = -1;
        }
      } else
        eltMask = Mask[i] - LHSWidth;

      // If LHS's width is changed, shift the mask value accordingly.
      // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any
      // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
      // If newRHS == newLHS, we want to remap any references from newRHS to
      // newLHS so that we can properly identify splats that may occur due to
      // obfuscation across the two vectors.
      if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
        eltMask += newLHSWidth;
    }

    // Check if this could still be a splat.
    if (eltMask >= 0) {
      if (SplatElt >= 0 && SplatElt != eltMask)
        isSplat = false;
      SplatElt = eltMask;
    }

    newMask.push_back(eltMask);
  }

  // If the result mask is equal to one of the original shuffle masks,
  // or is a splat, do the replacement.
  if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
    if (!newRHS)
      newRHS = UndefValue::get(newLHS->getType());
    return new ShuffleVectorInst(newLHS, newRHS, newMask);
  }

  return MadeChange ? &SVI : nullptr;
}