; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
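;
; Note: each RUN line enables several FileCheck prefixes at once, so output
; shared by multiple configurations is checked under the broad prefixes
; (CHECK, SSE, AVX, X86, X64) while configuration-specific output uses the
; narrower ones (e.g. X86-SSE or X64-AVX512).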

@g16 = external global i16

define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X86-SSE-LABEL: pinsrd_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x44,0x24,0x04,0x01]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrd_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrd_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrd_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrd $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0xc7,0x01]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrd_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrd_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
  ret <4 x i32> %tmp1
}

define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X86-SSE-LABEL: pinsrb_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0x44,0x24,0x04,0x01]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrb_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrb_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrb_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrb $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0xc7,0x01]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrb_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrb_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
  ret <16 x i8> %tmp1
}

define <2 x i64> @pmovzxbq_1() nounwind {
; X86-SSE-LABEL: pmovzxbq_1:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-SSE-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-SSE-NEXT:    pmovzxbq (%eax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]
; X86-SSE-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pmovzxbq_1:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-AVX1-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-AVX1-NEXT:    vpmovzxbq (%eax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X86-AVX1-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pmovzxbq_1:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-AVX512-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
; X86-AVX512-NEXT:    vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X86-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pmovzxbq_1:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-SSE-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-SSE-NEXT:    pmovzxbq (%rax), %xmm0 ## encoding: [0x66,0x0f,0x38,0x32,0x00]
; X64-SSE-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pmovzxbq_1:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-AVX1-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-AVX1-NEXT:    vpmovzxbq (%rax), %xmm0 ## encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X64-AVX1-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pmovzxbq_1:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-AVX512-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; X64-AVX512-NEXT:    vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
; X64-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load i16, i16* @g16, align 2  ; <i16> [#uses=1]
  %1 = insertelement <8 x i16> undef, i16 %0, i32 0  ; <<8 x i16>> [#uses=1]
  %2 = bitcast <8 x i16> %1 to <16 x i8>  ; <<16 x i8>> [#uses=1]
  %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone  ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %3
}

declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone

define i32 @extractps_1(<4 x float> %v) nounwind {
; SSE-LABEL: extractps_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: extractps_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: extractps_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  %i = bitcast float %s to i32
  ret i32 %i
}
define i32 @extractps_2(<4 x float> %v) nounwind {
; SSE-LABEL: extractps_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: extractps_2:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: extractps_2:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %t = bitcast <4 x float> %v to <4 x i32>
  %s = extractelement <4 x i32> %t, i32 3
  ret i32 %s
}


; The non-store form of extractps puts its result into a GPR.
; This makes it suitable for an extract from a <4 x float> that
; is bitcasted to i32, but unsuitable for much of anything else.
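; (For reference, not part of the checked output: the extractps immediate
; selects the source lane via bits [1:0], so the $3 in extractps_1 and
; extractps_2 above copies float element 3 into the GPR.)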

define float @ext_1(<4 x float> %v) nounwind {
; X86-SSE-LABEL: ext_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
; X86-SSE-NEXT:    shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]
; X86-SSE-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X86-SSE-NEXT:    addss LCPI5_0, %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
; X86-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: ext_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX1-NEXT:    vpermilps $231, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X86-AVX1-NEXT:    vaddss LCPI5_0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: ext_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX512-NEXT:    vpermilps $231, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X86-AVX512-NEXT:    vaddss LCPI5_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: ext_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]
; X64-SSE-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X64-SSE-NEXT:    addss {{.*}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
; X64-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: ext_1:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpermilps $231, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X64-AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: ext_1:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpermilps $231, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X64-AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A]
; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  %t = fadd float %s, 1.0
  ret float %t
}

define float @ext_2(<4 x float> %v) nounwind {
; X86-SSE-LABEL: ext_2:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    pushl %eax ## encoding: [0x50]
; X86-SSE-NEXT:    shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]
; X86-SSE-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X86-SSE-NEXT:    movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax ## encoding: [0x58]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: ext_2:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX1-NEXT:    vpermilps $231, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: ext_2:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    pushl %eax ## encoding: [0x50]
; X86-AVX512-NEXT:    vpermilps $231, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax ## encoding: [0x58]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: ext_2:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shufps $231, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xe7]
; X64-SSE-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: ext_2:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpermilps $231, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: ext_2:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpermilps $231, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[3,1,2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %s = extractelement <4 x float> %v, i32 3
  ret float %s
}

define i32 @ext_3(<4 x i32> %v) nounwind {
; SSE-LABEL: ext_3:
; SSE:       ## %bb.0:
; SSE-NEXT:    extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: ext_3:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: ext_3:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %i = extractelement <4 x i32> %v, i32 3
  ret i32 %i
}

define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; SSE-LABEL: insertps_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $21, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x15]
; SSE-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
; AVX1-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
; AVX512-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
  ret <4 x float> %tmp1
}
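
; (A reference decode of the insertps immediate, per the SSE4.1 encoding rules
; rather than anything this test checks: bits [7:6] select the source element,
; bits [5:4] select the destination slot, and bits [3:0] form a zero mask. So
; $21 = 0b00010101 copies xmm1[0] into slot 1 and zeroes slots 0 and 2, which
; is exactly the zero,xmm1[0],zero,xmm0[3] pattern in the CHECK lines above.)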

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X86-SSE-LABEL: blendps_not_insertps_1:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: blendps_not_insertps_1:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: blendps_not_insertps_1:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: blendps_not_insertps_1:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX-LABEL: blendps_not_insertps_1:
; X64-AVX:       ## %bb.0:
; X64-AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; X64-AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X86 but not for X64!
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X86-SSE-LABEL: insertps_or_blendps:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_or_blendps:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_or_blendps:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_or_blendps:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_or_blendps:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_or_blendps:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
  ret <4 x float> %tmp1
}

; An insert into the low 32 bits of a vector from the low 32 bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; SSE-LABEL: blendps_not_insertps_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: blendps_not_insertps_2:
; AVX:       ## %bb.0:
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp2 = extractelement <4 x float> %t2, i32 0
  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
  ret <4 x float> %tmp1
}
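
; (For reference: the blendps immediate is a per-element select mask, where a
; set bit i takes element i from the second source. Hence $1 above takes only
; element 0 from %xmm1 and keeps xmm0[1,2,3], as the shuffle comments show.)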

define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_1:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_2:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; SSE-LABEL: ptestz_3:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX-LABEL: ptestz_3:
; AVX:       ## %bb.0:
; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
  ret i32 %tmp1
}

declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

; This used to compile to insertps $0 + insertps $16. insertps $0 is always
; pointless.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
; SSE-LABEL: buildvector:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0]
; SSE-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup %xmm1, %xmm3 ## encoding: [0xf3,0x0f,0x16,0xd9]
; SSE-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm2, %xmm3 ## encoding: [0xf3,0x0f,0x58,0xda]
; SSE-NEXT:    addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
; SSE-NEXT:    insertps $16, %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc3,0x10]
; SSE-NEXT:    ## xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: buildvector:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vmovshdup %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x16,0xd0]
; AVX1-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovshdup %xmm1, %xmm3 ## encoding: [0xc5,0xfa,0x16,0xd9]
; AVX1-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xea,0x58,0xd3]
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
; AVX1-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: buildvector:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]
; AVX512-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; AVX512-NEXT:    vmovshdup %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd9]
; AVX512-NEXT:    ## xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vaddss %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x58,0xd3]
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
; AVX512-NEXT:    vinsertps $16, %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %tmp7 = extractelement <2 x float> %A, i32 0
  %tmp5 = extractelement <2 x float> %A, i32 1
  %tmp3 = extractelement <2 x float> %B, i32 0
  %tmp1 = extractelement <2 x float> %B, i32 1
  %add.r = fadd float %tmp7, %tmp3
  %add.i = fadd float %tmp5, %tmp1
  %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
  %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
  ret <2 x float> %tmp9
}

define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X86-SSE-LABEL: insertps_from_shufflevector_1:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_shufflevector_1:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_shufflevector_1:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_shufflevector_1:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_shufflevector_1:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_shufflevector_1:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load <4 x float>, <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
}

define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insertps_from_shufflevector_2:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_2:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_2:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
}

; For loading an i32 from memory into an xmm register we use pinsrd
; instead of insertps.
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X86-SSE-LABEL: pinsrd_from_shufflevector_i32:
; X86-SSE:       ## %bb.0: ## %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    pshufd $36, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x24]
; X86-SSE-NEXT:    ## xmm1 = mem[0,1,2,0]
; X86-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32:
; X86-AVX1:       ## %bb.0: ## %entry
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $36, (%eax), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x08,0x24]
; X86-AVX1-NEXT:    ## xmm1 = mem[0,1,2,0]
; X86-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32:
; X86-AVX512:       ## %bb.0: ## %entry
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
; X86-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pinsrd_from_shufflevector_i32:
; X64-SSE:       ## %bb.0: ## %entry
; X64-SSE-NEXT:    pshufd $36, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x24]
; X64-SSE-NEXT:    ## xmm1 = mem[0,1,2,0]
; X64-SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32:
; X64-AVX1:       ## %bb.0: ## %entry
; X64-AVX1-NEXT:    vpermilps $36, (%rdi), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x24]
; X64-AVX1-NEXT:    ## xmm1 = mem[0,1,2,0]
; X64-AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32:
; X64-AVX512:       ## %bb.0: ## %entry
; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
; X64-AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
entry:
  %0 = load <4 x i32>, <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
}
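
; (For reference: the pshufd/vpermilps immediate packs four 2-bit source
; indices, lowest element first, so $36 = 0b00100100 selects lanes 0,1,2,0 and
; produces the mem[0,1,2,0] pattern seen above.)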

define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: insertps_from_shufflevector_i32_2:
; SSE:       ## %bb.0: ## %entry
; SSE-NEXT:    pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
; SSE-NEXT:    ## xmm1 = xmm1[2,3,0,1]
; SSE-NEXT:    pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c]
; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_from_shufflevector_i32_2:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
; AVX1-NEXT:    ## xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_from_shufflevector_i32_2:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
; AVX512-NEXT:    ## xmm1 = xmm1[2,3,0,1]
; AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
}

define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X86-SSE-LABEL: insertps_from_load_ins_elt_undef:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    insertps $16, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x10]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vinsertps $16, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vinsertps $16, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_load_ins_elt_undef:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    insertps $16, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x10]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vinsertps $16, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
}

; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X86-SSE-LABEL: insertps_from_load_ins_elt_undef_i32:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    pinsrd $2, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x00,0x02]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    pinsrd $2, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x07,0x02]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vpinsrd $2, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vpinsrd $2, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load i32, i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
}

;;;;;; Shuffles optimizable with a single insertps or blend instruction
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_XYZ0:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08]
; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_XYZ0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_XYZ0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_XY00:
; SSE:       ## %bb.0:
; SSE-NEXT:    movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
; SSE-NEXT:    ## xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_XY00:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_XY00:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_XYY0:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $104, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x68]
; SSE-NEXT:    ## xmm0 = xmm0[0,1,1],zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_XYY0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $104, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,1],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_XYY0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $104, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,1],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_XYW0:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xe8]
; SSE-NEXT:    ## xmm0 = xmm0[0,1,3],zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_XYW0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $232, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,3],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_XYW0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $232, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,3],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext2 = extractelement <4 x float> %x, i32 3
  %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_W00W:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $198, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xc6]
; SSE-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_W00W:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $198, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6]
; AVX1-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_W00W:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $198, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6]
; AVX512-NEXT:    ## xmm0 = xmm0[3],zero,zero,xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 3
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_X00A:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $54, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x36]
; SSE-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_X00A:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $54, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_X00A:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $54, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm1[0]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_X00X:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $54, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x36]
; SSE-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_X00X:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $54, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_X00X:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $54, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuf_X0YC:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm2, %xmm2 ## encoding: [0x0f,0x57,0xd2]
; SSE-NEXT:    unpcklps %xmm2, %xmm0 ## encoding: [0x0f,0x14,0xc2]
; SSE-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0]
; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: shuf_X0YC:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
; AVX1-NEXT:    vunpcklps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc2]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT:    vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: shuf_X0YC:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2]
; AVX512-NEXT:    vunpcklps %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc2]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512-NEXT:    vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[2]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}

define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_XYZ0:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08]
; SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_XYZ0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_XYZ0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext3 = extractelement <4 x i32> %x, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_XY00:
; SSE:       ## %bb.0:
; SSE-NEXT:    movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0]
; SSE-NEXT:    ## xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_XY00:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_XY00:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_XYY0:
; SSE:       ## %bb.0:
; SSE-NEXT:    pshufd $212, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xd4]
; SSE-NEXT:    ## xmm1 = xmm0[0,1,1,3]
; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
; SSE-NEXT:    pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f]
; SSE-NEXT:    ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_XYY0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,1,3]
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_XYY0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpermilps $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,1,3]
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
  %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
  ret <4 x i32> %vecinit5
}

define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_XYW0:
; SSE:       ## %bb.0:
; SSE-NEXT:    pshufd $244, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xf4]
; SSE-NEXT:    ## xmm1 = xmm0[0,1,3,3]
; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
; SSE-NEXT:    pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f]
; SSE-NEXT:    ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: i32_shuf_XYW0:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpermilps $244, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,3,3]
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: i32_shuf_XYW0:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpermilps $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,3,3]
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x i32> %x, i32 0
  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
  %vecext1 = extractelement <4 x i32> %x, i32 1
  %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
  %vecext2 = extractelement <4 x i32> %x, i32 3
  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
  ret <4 x i32> %vecinit4
}

define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; SSE-LABEL: i32_shuf_W00W:
; SSE:       ## %bb.0:
; SSE-NEXT:    pshufd $231, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xe7]
; SSE-NEXT:    ## xmm1 = xmm0[3,1,2,3]
1167; SSE-NEXT:    pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0]
1168; SSE-NEXT:    pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3]
1169; SSE-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
1170; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1171;
1172; AVX1-LABEL: i32_shuf_W00W:
1173; AVX1:       ## %bb.0:
1174; AVX1-NEXT:    vpermilps $231, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
1175; AVX1-NEXT:    ## xmm0 = xmm0[3,1,2,3]
1176; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1177; AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1178; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1179; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1180;
1181; AVX512-LABEL: i32_shuf_W00W:
1182; AVX512:       ## %bb.0:
1183; AVX512-NEXT:    vpermilps $231, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xe7]
1184; AVX512-NEXT:    ## xmm0 = xmm0[3,1,2,3]
1185; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1186; AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1187; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1188; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1189  %vecext = extractelement <4 x i32> %x, i32 3
1190  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1191  %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1192  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
1193  %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
1194  ret <4 x i32> %vecinit4
1195}
1196
1197define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
1198; SSE-LABEL: i32_shuf_X00A:
1199; SSE:       ## %bb.0:
1200; SSE-NEXT:    pxor %xmm2, %xmm2 ## encoding: [0x66,0x0f,0xef,0xd2]
1201; SSE-NEXT:    pblendw $252, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0xfc]
1202; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
1203; SSE-NEXT:    pshufd $36, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x24]
1204; SSE-NEXT:    ## xmm1 = xmm1[0,1,2,0]
1205; SSE-NEXT:    pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0]
1206; SSE-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1207; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1208;
1209; AVX1-LABEL: i32_shuf_X00A:
1210; AVX1:       ## %bb.0:
1211; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
1212; AVX1-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
1213; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
1214; AVX1-NEXT:    vpermilps $36, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x24]
1215; AVX1-NEXT:    ## xmm1 = xmm1[0,1,2,0]
1216; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1217; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1218; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1219;
1220; AVX512-LABEL: i32_shuf_X00A:
1221; AVX512:       ## %bb.0:
1222; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
1223; AVX512-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
1224; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
1225; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
1226; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
1227; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1228; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1229  %vecext = extractelement <4 x i32> %x, i32 0
1230  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1231  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1232  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
1233  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
1234  ret <4 x i32> %vecinit4
1235}
1236
1237define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
1238; SSE-LABEL: i32_shuf_X00X:
1239; SSE:       ## %bb.0:
1240; SSE-NEXT:    pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9]
1241; SSE-NEXT:    pshufd $36, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x24]
1242; SSE-NEXT:    ## xmm0 = xmm0[0,1,2,0]
1243; SSE-NEXT:    pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c]
1244; SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
1245; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1246;
1247; AVX1-LABEL: i32_shuf_X00X:
1248; AVX1:       ## %bb.0:
1249; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
1250; AVX1-NEXT:    vpermilps $36, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x24]
1251; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2,0]
1252; AVX1-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1253; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1254; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1255;
1256; AVX512-LABEL: i32_shuf_X00X:
1257; AVX512:       ## %bb.0:
1258; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
1259; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
1260; AVX512-NEXT:    vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06]
1261; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1262; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1263  %vecext = extractelement <4 x i32> %x, i32 0
1264  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1265  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1266  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
1267  %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
1268  ret <4 x i32> %vecinit4
1269}
1270
1271define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
1272; SSE-LABEL: i32_shuf_X0YC:
1273; SSE:       ## %bb.0:
1274; SSE-NEXT:    pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0]
1275; SSE-NEXT:    ## xmm2 = xmm0[0],zero,xmm0[1],zero
1276; SSE-NEXT:    pshufd $164, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xa4]
1277; SSE-NEXT:    ## xmm0 = xmm1[0,1,2,2]
1278; SSE-NEXT:    pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f]
1279; SSE-NEXT:    ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
1280; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1281;
1282; AVX1-LABEL: i32_shuf_X0YC:
1283; AVX1:       ## %bb.0:
1284; AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0]
1285; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
1286; AVX1-NEXT:    vpshufd $164, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xa4]
1287; AVX1-NEXT:    ## xmm1 = xmm1[0,1,2,2]
1288; AVX1-NEXT:    vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0]
1289; AVX1-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1290; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1291;
1292; AVX512-LABEL: i32_shuf_X0YC:
1293; AVX512:       ## %bb.0:
1294; AVX512-NEXT:    vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0]
1295; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[1],zero
1296; AVX512-NEXT:    vpshufd $164, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xa4]
1297; AVX512-NEXT:    ## xmm1 = xmm1[0,1,2,2]
1298; AVX512-NEXT:    vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08]
1299; AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[3]
1300; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1301  %vecext = extractelement <4 x i32> %x, i32 0
1302  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
1303  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
1304  %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
1305  %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
1306  ret <4 x i32> %vecinit5
1307}
1308
1309;; Test for a bug in the first implementation of LowerBuildVectorv4X86
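;; (Note, not a CHECK line: the IR below rebuilds <x0,x1,x2,0.0> lane by lane
;; and feeds the result to an fcmp/select that lowers to maxps; this is a
;; sketch of the build_vector-with-a-zero-lane pattern that presumably tripped
;; the first LowerBuildVectorv4x32 implementation.)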
define <4 x float> @test_insertps_no_undef(<4 x float> %x) {
; SSE-LABEL: test_insertps_no_undef:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $7, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc8,0x07]
; SSE-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
; SSE-NEXT:    maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: test_insertps_no_undef:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08]
; AVX1-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: test_insertps_no_undef:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08]
; AVX512-NEXT:    ## xmm1 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %x, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecext3 = extractelement <4 x float> %x, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
  %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
  %mask = fcmp olt <4 x float> %vecinit5, %x
  %res = select <4 x i1> %mask, <4 x float> %x, <4 x float> %vecinit5
  ret <4 x float> %res
}

define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; SSE-LABEL: blendvb_fallback:
; SSE:       ## %bb.0:
; SSE-NEXT:    psllw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x0f]
; SSE-NEXT:    psraw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x0f]
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x38,0x10,0xd1]
; SSE-NEXT:    movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: blendvb_fallback:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpsllw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x0f]
; AVX1-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: blendvb_fallback:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpsllw $15, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x0f]
; AVX512-NEXT:    vpmovw2m %xmm0, %k1 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc8]
; AVX512-NEXT:    vpblendmw %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x66,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
  ret <8 x i16> %ret
}

; On X86, account for the argument's move to registers
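; (Note: under the Darwin i386 calling convention the pointer argument lands
; on the stack, hence the extra "movl {{[0-9]+}}(%esp), %eax" in the X86
; bodies below; on x86-64 the pointer is already in %rdi.)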
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X86-SSE-LABEL: insertps_from_vector_load:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_vector_load:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_vector_load:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_vector_load:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_vector_load:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_vector_load:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i8 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
;; Try to match a bit more of the instr, since we need the load's offset.
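;; (Note on the immediate, per the SSE4.1 insertps encoding: bits [7:6] are
;; CountS (source lane), bits [5:4] are CountD (destination lane), bits [3:0]
;; are the zero mask. Here $96 = 0b0110_0000 copies source lane 1 into
;; destination lane 2. The memory form of insertps loads a single f32 and
;; ignores CountS, so folding this load would require adjusting the address;
;; the CHECKs below pin the separate movaps + register insertps instead.)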
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X86-SSE-LABEL: insertps_from_vector_load_offset:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_vector_load_offset:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_vector_load_offset:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_vector_load_offset:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_vector_load_offset:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_vector_load_offset:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load <4 x float>, <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i8 96)
  ret <4 x float> %2
}

;; Try to match a bit more of the instr, since we need the load's offset.
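;; (Note: here $192 = 0b1100_0000 decodes as CountS=3, CountD=0, ZMask=0,
;; i.e. source lane 3 into destination lane 0, matching the
;; "xmm0 = xmm1[3],xmm0[1,2,3]" comments below.)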
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X86-SSE-LABEL: insertps_from_vector_load_offset_2:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; X86-SSE-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
; X86-SSE-NEXT:    movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08]
; X86-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
; X86-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_vector_load_offset_2:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; X86-AVX1-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08]
; X86-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_vector_load_offset_2:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; X86-AVX512-NEXT:    shll $4, %ecx ## encoding: [0xc1,0xe1,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08]
; X86-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_vector_load_offset_2:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
; X64-SSE-NEXT:    movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37]
; X64-SSE-NEXT:    insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0]
; X64-SSE-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_vector_load_offset_2:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
; X64-AVX1-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37]
; X64-AVX1-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X64-AVX1-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_vector_load_offset_2:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04]
; X64-AVX512-NEXT:    vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37]
; X64-AVX512-NEXT:    vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0]
; X64-AVX512-NEXT:    ## xmm0 = xmm1[3],xmm0[1,2,3]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  %2 = load <4 x float>, <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i8 192)
  ret <4 x float> %3
}

define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X86-SSE-LABEL: insertps_from_broadcast_loadf32:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-SSE-NEXT:    insertps $48, (%ecx,%eax,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0x81,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_broadcast_loadf32:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_broadcast_loadf32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0x81]
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadf32:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    insertps $48, (%rdi,%rsi,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0xb7,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_broadcast_loadf32:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadf32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0xb7]
; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i8 48)
  ret <4 x float> %7
}

define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08]
; X86-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f]
; X64-SSE-NEXT:    insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
; X64-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load <4 x float>, <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i8 48)
  ret <4 x float> %7
}

define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X86-SSE-LABEL: insertps_from_broadcast_multiple_use:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-SSE-NEXT:    movss (%ecx,%eax,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0x81]
; X86-SSE-NEXT:    ## xmm4 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
; X86-SSE-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X86-SSE-NEXT:    addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
; X86-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
; X86-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
; X86-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
; X86-SSE-NEXT:    addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
; X86-SSE-NEXT:    addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_multiple_use:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movss (%rdi,%rsi,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0xb7]
; X64-SSE-NEXT:    ## xmm4 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
; X64-SSE-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X64-SSE-NEXT:    addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1]
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30]
; X64-SSE-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[0]
; X64-SSE-NEXT:    insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30]
; X64-SSE-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[0]
; X64-SSE-NEXT:    addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda]
; X64-SSE-NEXT:    addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = getelementptr inbounds float, float* %fb, i64 %index
  %2 = load float, float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i8 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i8 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i8 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i8 48)
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
}

define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; X86-SSE-LABEL: insertps_with_undefs:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x08]
; X86-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
; X86-SSE-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
; X86-SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: insertps_with_undefs:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovss (%eax), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x08]
; X86-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: insertps_with_undefs:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_with_undefs:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movss (%rdi), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x0f]
; X64-SSE-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
; X64-SSE-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: insertps_with_undefs:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovss (%rdi), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x0f]
; X64-AVX1-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: insertps_with_undefs:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
; X64-AVX512-NEXT:    ## xmm1 = mem[0],zero,zero,zero
; X64-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[0]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %1 = load float, float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
  ret <4 x float> %result
}

; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
; the destination index to change the load, instead of the source index.
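; (Note: $178 = 0b1011_0010 decodes as CountS=2, CountD=3, ZMask=0b0010, so
; source lane 2 goes to destination lane 3 and lane 1 is zeroed, i.e.
; "xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]"; the old bug picked the load element
; with the destination index (3) instead of the source index (2).)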
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; X86-SSE-LABEL: pr20087:
; X86-SSE:       ## %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08]
; X86-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
; X86-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X86-SSE-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: pr20087:
; X86-AVX1:       ## %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: pr20087:
; X86-AVX512:       ## %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08]
; X86-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pr20087:
; X64-SSE:       ## %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f]
; X64-SSE-NEXT:    insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2]
; X64-SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X64-SSE-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: pr20087:
; X64-AVX1:       ## %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX1-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: pr20087:
; X64-AVX512:       ## %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
; X64-AVX512-NEXT:    vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2]
; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
  %load = load <4 x float>, <4 x float>* %ptr
  %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
  ret <4 x float> %ret
}

; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
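; (Note: lane 7 of the concatenated inputs is lane 3 of %shuffle116; the
; expected lowering rotates it into position with pshufd/vpermilps $78
; (lanes [2,3,0,1]) and then blends it in next to lane 0 of %shuffle109,
; as the CHECK lines below pin down.)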
1875define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
1876; X86-SSE-LABEL: insertps_pr20411:
1877; X86-SSE:       ## %bb.0:
1878; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1879; X86-SSE-NEXT:    pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
1880; X86-SSE-NEXT:    ## xmm1 = xmm1[2,3,0,1]
1881; X86-SSE-NEXT:    pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
1882; X86-SSE-NEXT:    ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1883; X86-SSE-NEXT:    movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08]
1884; X86-SSE-NEXT:    retl ## encoding: [0xc3]
1885;
1886; X86-AVX1-LABEL: insertps_pr20411:
1887; X86-AVX1:       ## %bb.0:
1888; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1889; X86-AVX1-NEXT:    vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
1890; X86-AVX1-NEXT:    ## xmm1 = xmm1[2,3,0,1]
1891; X86-AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1892; X86-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1893; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00]
1894; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
1895;
1896; X86-AVX512-LABEL: insertps_pr20411:
1897; X86-AVX512:       ## %bb.0:
1898; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
1899; X86-AVX512-NEXT:    vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
1900; X86-AVX512-NEXT:    ## xmm1 = xmm1[2,3,0,1]
1901; X86-AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1902; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1903; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
1904; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
1905;
1906; X64-SSE-LABEL: insertps_pr20411:
1907; X64-SSE:       ## %bb.0:
1908; X64-SSE-NEXT:    pshufd $78, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x4e]
1909; X64-SSE-NEXT:    ## xmm1 = xmm1[2,3,0,1]
1910; X64-SSE-NEXT:    pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3]
1911; X64-SSE-NEXT:    ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1912; X64-SSE-NEXT:    movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f]
1913; X64-SSE-NEXT:    retq ## encoding: [0xc3]
1914;
1915; X64-AVX1-LABEL: insertps_pr20411:
1916; X64-AVX1:       ## %bb.0:
1917; X64-AVX1-NEXT:    vpermilps $78, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
1918; X64-AVX1-NEXT:    ## xmm1 = xmm1[2,3,0,1]
1919; X64-AVX1-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1920; X64-AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1921; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07]
1922; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
1923;
1924; X64-AVX512-LABEL: insertps_pr20411:
1925; X64-AVX512:       ## %bb.0:
1926; X64-AVX512-NEXT:    vpermilps $78, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x4e]
1927; X64-AVX512-NEXT:    ## xmm1 = xmm1[2,3,0,1]
1928; X64-AVX512-NEXT:    vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02]
1929; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1930; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
1931; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
1932  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef>
1933  %ptrcast = bitcast i32* %RET to <4 x i32>*
1934  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
1935  ret void
1936}
1937
1938define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
1939; SSE-LABEL: insertps_4:
1940; SSE:       ## %bb.0:
1941; SSE-NEXT:    insertps $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xaa]
1942; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1943; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1944;
1945; AVX1-LABEL: insertps_4:
1946; AVX1:       ## %bb.0:
1947; AVX1-NEXT:    vinsertps $170, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa]
1948; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1949; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1950;
1951; AVX512-LABEL: insertps_4:
1952; AVX512:       ## %bb.0:
1953; AVX512-NEXT:    vinsertps $170, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa]
1954; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[2],zero
1955; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1956  %vecext = extractelement <4 x float> %A, i32 0
1957  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1958  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
1959  %vecext2 = extractelement <4 x float> %B, i32 2
1960  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
1961  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1962  ret <4 x float> %vecinit4
1963}
1964
1965define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
1966; SSE-LABEL: insertps_5:
1967; SSE:       ## %bb.0:
1968; SSE-NEXT:    insertps $92, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x5c]
1969; SSE-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1970; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1971;
1972; AVX1-LABEL: insertps_5:
1973; AVX1:       ## %bb.0:
1974; AVX1-NEXT:    vinsertps $92, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c]
1975; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[1],zero,zero
1976; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1977;
1978; AVX512-LABEL: insertps_5:
1979; AVX512:       ## %bb.0:
1980; AVX512-NEXT:    vpblendd $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x02]
1981; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1982; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
1983; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
1984; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
1985  %vecext = extractelement <4 x float> %A, i32 0
1986  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
1987  %vecext1 = extractelement <4 x float> %B, i32 1
1988  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
1989  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
1990  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
1991  ret <4 x float> %vecinit4
1992}
1993
1994define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
1995; SSE-LABEL: insertps_6:
1996; SSE:       ## %bb.0:
1997; SSE-NEXT:    insertps $169, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xa9]
1998; SSE-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
1999; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2000;
2001; AVX1-LABEL: insertps_6:
2002; AVX1:       ## %bb.0:
2003; AVX1-NEXT:    vinsertps $169, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9]
2004; AVX1-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
2005; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2006;
2007; AVX512-LABEL: insertps_6:
2008; AVX512:       ## %bb.0:
2009; AVX512-NEXT:    vinsertps $169, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9]
2010; AVX512-NEXT:    ## xmm0 = zero,xmm0[1],xmm1[2],zero
2011; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2012  %vecext = extractelement <4 x float> %A, i32 1
2013  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
2014  %vecext1 = extractelement <4 x float> %B, i32 2
2015  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
2016  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
2017  ret <4 x float> %vecinit3
2018}
2019
2020define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
2021; SSE-LABEL: insertps_7:
2022; SSE:       ## %bb.0:
2023; SSE-NEXT:    insertps $106, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x6a]
2024; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2025; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2026;
2027; AVX1-LABEL: insertps_7:
2028; AVX1:       ## %bb.0:
2029; AVX1-NEXT:    vinsertps $106, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a]
2030; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2031; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2032;
2033; AVX512-LABEL: insertps_7:
2034; AVX512:       ## %bb.0:
2035; AVX512-NEXT:    vinsertps $106, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a]
2036; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm1[1],zero
2037; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
2038  %vecext = extractelement <4 x float> %A, i32 0
2039  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
2040  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
2041  %vecext2 = extractelement <4 x float> %B, i32 1
2042  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
2043  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
2044  ret <4 x float> %vecinit4
2045}
2046
define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: insertps_8:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $28, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x1c]
; SSE-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_8:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $28, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c]
; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $28, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c]
; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm1[0],zero,zero
; AVX512-NEXT:    vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecext1 = extractelement <4 x float> %B, i32 0
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

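; insertps_9 swaps the operand roles: A[0] is inserted into B. With the
; two-operand SSE form the result lands in %xmm1, so an extra movaps is
; needed to return it in %xmm0; the three-operand AVX forms write %xmm0
; directly and avoid the copy.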
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: insertps_9:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $25, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xc8,0x19]
; SSE-NEXT:    ## xmm1 = zero,xmm0[0],xmm1[2],zero
; SSE-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_9:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $25, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19]
; AVX1-NEXT:    ## xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_9:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $25, %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19]
; AVX512-NEXT:    ## xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
  %vecext1 = extractelement <4 x float> %B, i32 2
  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit3
}

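; insertps_10 is the single-input case: the same register is both source and
; destination. $42 = 0b00101010 copies A[0] into lane 2 and zeroes lanes
; 1 and 3, while lane 0 keeps its original value.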
define <4 x float> @insertps_10(<4 x float> %A) {
; SSE-LABEL: insertps_10:
; SSE:       ## %bb.0:
; SSE-NEXT:    insertps $42, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x2a]
; SSE-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: insertps_10:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vinsertps $42, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a]
; AVX1-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: insertps_10:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vinsertps $42, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a]
; AVX512-NEXT:    ## xmm0 = xmm0[0],zero,xmm0[0],zero
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 0
  %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
  %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
  ret <4 x float> %vecbuild2
}

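; When a build vector only inserts zeros into existing lanes, the lowering
; selects xorps + blendps rather than insertps: the blend immediate is a
; per-lane mask choosing between the zero register and the input, so in
; build_vector_to_shuffle_1 the SSE mask $5 (lanes 0 and 2 from the zeroed
; %xmm1) and the AVX mask $10 (operands commuted) describe the same result.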
define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; SSE-LABEL: build_vector_to_shuffle_1:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $5, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x05]
; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: build_vector_to_shuffle_1:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a]
; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit3
}

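; build_vector_to_shuffle_2 is the same pattern without the trailing
; shufflevector, so only A[1] survives: SSE blends with mask $13 (lanes
; 0, 2, 3 from the zero register), while the commuted AVX form uses $2
; (lane 1 from A).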
define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; SSE-LABEL: build_vector_to_shuffle_2:
; SSE:       ## %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
; SSE-NEXT:    blendps $13, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x0d]
; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX1-LABEL: build_vector_to_shuffle_2:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
; AVX1-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
; AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512-LABEL: build_vector_to_shuffle_2:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9]
; AVX512-NEXT:    vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02]
; AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
  %vecext = extractelement <4 x float> %A, i32 1
  %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
  ret <4 x float> %vecinit1
}
