• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
8
; Schedule-annotation test: a <4 x i32> load from %a1 is duplicated into both
; 128-bit halves of an <8 x i32> and added to %a0. The assertions below expect
; the memory form of vbroadcasti128 followed by vpaddd, and pin the `sched`
; annotation printed for each instruction on every CPU model in the RUN lines.
9define <8 x i32> @test_broadcasti128(<8 x i32> %a0, <4 x i32> *%a1) {
10; GENERIC-LABEL: test_broadcasti128:
11; GENERIC:       # %bb.0:
12; GENERIC-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [6:1.00]
13; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
14; GENERIC-NEXT:    retq # sched: [1:1.00]
15;
16; HASWELL-LABEL: test_broadcasti128:
17; HASWELL:       # %bb.0:
18; HASWELL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
19; HASWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
20; HASWELL-NEXT:    retq # sched: [7:1.00]
21;
22; BROADWELL-LABEL: test_broadcasti128:
23; BROADWELL:       # %bb.0:
24; BROADWELL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [6:0.50]
25; BROADWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
26; BROADWELL-NEXT:    retq # sched: [7:1.00]
27;
28; SKYLAKE-LABEL: test_broadcasti128:
29; SKYLAKE:       # %bb.0:
30; SKYLAKE-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
31; SKYLAKE-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
32; SKYLAKE-NEXT:    retq # sched: [7:1.00]
33;
34; SKX-LABEL: test_broadcasti128:
35; SKX:       # %bb.0:
36; SKX-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
37; SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
38; SKX-NEXT:    retq # sched: [7:1.00]
39;
40; ZNVER1-LABEL: test_broadcasti128:
41; ZNVER1:       # %bb.0:
42; ZNVER1-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [8:0.50]
43; ZNVER1-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
44; ZNVER1-NEXT:    retq # sched: [1:0.50]
45  %1 = load <4 x i32>, <4 x i32> *%a1, align 16
46  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
47  %3 = add <8 x i32> %2, %a0
48  ret <8 x i32> %3
49}
50
; Schedule-annotation test: element 0 of %a0 is splatted (all-zero shuffle mask)
; into a <4 x double>, which is then added to itself. The assertions below
; expect vbroadcastsd from xmm to ymm followed by vaddpd, and pin the `sched`
; annotation printed for each instruction on every CPU model in the RUN lines.
51define <4 x double> @test_broadcastsd_ymm(<2 x double> %a0) {
52; GENERIC-LABEL: test_broadcastsd_ymm:
53; GENERIC:       # %bb.0:
54; GENERIC-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [1:1.00]
55; GENERIC-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
56; GENERIC-NEXT:    retq # sched: [1:1.00]
57;
58; HASWELL-LABEL: test_broadcastsd_ymm:
59; HASWELL:       # %bb.0:
60; HASWELL-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
61; HASWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
62; HASWELL-NEXT:    retq # sched: [7:1.00]
63;
64; BROADWELL-LABEL: test_broadcastsd_ymm:
65; BROADWELL:       # %bb.0:
66; BROADWELL-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
67; BROADWELL-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
68; BROADWELL-NEXT:    retq # sched: [7:1.00]
69;
70; SKYLAKE-LABEL: test_broadcastsd_ymm:
71; SKYLAKE:       # %bb.0:
72; SKYLAKE-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
73; SKYLAKE-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
74; SKYLAKE-NEXT:    retq # sched: [7:1.00]
75;
76; SKX-LABEL: test_broadcastsd_ymm:
77; SKX:       # %bb.0:
78; SKX-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
79; SKX-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
80; SKX-NEXT:    retq # sched: [7:1.00]
81;
82; ZNVER1-LABEL: test_broadcastsd_ymm:
83; ZNVER1:       # %bb.0:
84; ZNVER1-NEXT:    vbroadcastsd %xmm0, %ymm0 # sched: [100:0.25]
85; ZNVER1-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
86; ZNVER1-NEXT:    retq # sched: [1:0.50]
87  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
88  %2 = fadd <4 x double> %1, %1
89  ret <4 x double> %2
90}
91
; Schedule-annotation test: element 0 of %a0 is splatted (all-zero shuffle mask)
; across a <4 x float>, which is then added to itself. The assertions below
; expect the xmm-to-xmm form of vbroadcastss followed by vaddps, and pin the
; `sched` annotation printed for each instruction on every CPU model in the
; RUN lines.
92define <4 x float> @test_broadcastss(<4 x float> %a0) {
93; GENERIC-LABEL: test_broadcastss:
94; GENERIC:       # %bb.0:
95; GENERIC-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
96; GENERIC-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
97; GENERIC-NEXT:    retq # sched: [1:1.00]
98;
99; HASWELL-LABEL: test_broadcastss:
100; HASWELL:       # %bb.0:
101; HASWELL-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
102; HASWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
103; HASWELL-NEXT:    retq # sched: [7:1.00]
104;
105; BROADWELL-LABEL: test_broadcastss:
106; BROADWELL:       # %bb.0:
107; BROADWELL-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
108; BROADWELL-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
109; BROADWELL-NEXT:    retq # sched: [7:1.00]
110;
111; SKYLAKE-LABEL: test_broadcastss:
112; SKYLAKE:       # %bb.0:
113; SKYLAKE-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
114; SKYLAKE-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
115; SKYLAKE-NEXT:    retq # sched: [7:1.00]
116;
117; SKX-LABEL: test_broadcastss:
118; SKX:       # %bb.0:
119; SKX-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
120; SKX-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
121; SKX-NEXT:    retq # sched: [7:1.00]
122;
123; ZNVER1-LABEL: test_broadcastss:
124; ZNVER1:       # %bb.0:
125; ZNVER1-NEXT:    vbroadcastss %xmm0, %xmm0 # sched: [1:0.50]
126; ZNVER1-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
127; ZNVER1-NEXT:    retq # sched: [1:0.50]
128  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
129  %2 = fadd <4 x float> %1, %1
130  ret <4 x float> %2
131}
132
; Schedule-annotation test: element 0 of %a0 is splatted (all-zero shuffle mask)
; into an <8 x float>, which is then added to itself. The assertions below
; expect the xmm-to-ymm form of vbroadcastss followed by vaddps, and pin the
; `sched` annotation printed for each instruction on every CPU model in the
; RUN lines.
133define <8 x float> @test_broadcastss_ymm(<4 x float> %a0) {
134; GENERIC-LABEL: test_broadcastss_ymm:
135; GENERIC:       # %bb.0:
136; GENERIC-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [1:1.00]
137; GENERIC-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
138; GENERIC-NEXT:    retq # sched: [1:1.00]
139;
140; HASWELL-LABEL: test_broadcastss_ymm:
141; HASWELL:       # %bb.0:
142; HASWELL-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
143; HASWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
144; HASWELL-NEXT:    retq # sched: [7:1.00]
145;
146; BROADWELL-LABEL: test_broadcastss_ymm:
147; BROADWELL:       # %bb.0:
148; BROADWELL-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
149; BROADWELL-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
150; BROADWELL-NEXT:    retq # sched: [7:1.00]
151;
152; SKYLAKE-LABEL: test_broadcastss_ymm:
153; SKYLAKE:       # %bb.0:
154; SKYLAKE-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
155; SKYLAKE-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
156; SKYLAKE-NEXT:    retq # sched: [7:1.00]
157;
158; SKX-LABEL: test_broadcastss_ymm:
159; SKX:       # %bb.0:
160; SKX-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
161; SKX-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
162; SKX-NEXT:    retq # sched: [7:1.00]
163;
164; ZNVER1-LABEL: test_broadcastss_ymm:
165; ZNVER1:       # %bb.0:
166; ZNVER1-NEXT:    vbroadcastss %xmm0, %ymm0 # sched: [100:0.25]
167; ZNVER1-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
168; ZNVER1-NEXT:    retq # sched: [1:0.50]
169  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
170  %2 = fadd <8 x float> %1, %1
171  ret <8 x float> %2
172}
173
; Schedule-annotation test: computes %a0+%a1 and %a0-%a1, then takes elements
; 4..7 (the upper 128 bits) of each. The sum's upper half is stored through
; %a2 and the difference's upper half is returned, so the assertions below
; cover both the register-destination and the memory-destination form of
; vextracti128. A vzeroupper is expected before the return on every CPU model.
174define <4 x i32> @test_extracti128(<8 x i32> %a0, <8 x i32> %a1, <4 x i32> *%a2) {
175; GENERIC-LABEL: test_extracti128:
176; GENERIC:       # %bb.0:
177; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
178; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
179; GENERIC-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [1:1.00]
180; GENERIC-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
181; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
182; GENERIC-NEXT:    retq # sched: [1:1.00]
183;
184; HASWELL-LABEL: test_extracti128:
185; HASWELL:       # %bb.0:
186; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
187; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
188; HASWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
189; HASWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
190; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
191; HASWELL-NEXT:    retq # sched: [7:1.00]
192;
193; BROADWELL-LABEL: test_extracti128:
194; BROADWELL:       # %bb.0:
195; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
196; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
197; BROADWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
198; BROADWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
199; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
200; BROADWELL-NEXT:    retq # sched: [7:1.00]
201;
202; SKYLAKE-LABEL: test_extracti128:
203; SKYLAKE:       # %bb.0:
204; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
205; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
206; SKYLAKE-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
207; SKYLAKE-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
208; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
209; SKYLAKE-NEXT:    retq # sched: [7:1.00]
210;
211; SKX-LABEL: test_extracti128:
212; SKX:       # %bb.0:
213; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
214; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
215; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
216; SKX-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
217; SKX-NEXT:    vzeroupper # sched: [4:1.00]
218; SKX-NEXT:    retq # sched: [7:1.00]
219;
220; ZNVER1-LABEL: test_extracti128:
221; ZNVER1:       # %bb.0:
222; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.25]
223; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
224; ZNVER1-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [2:0.25]
225; ZNVER1-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:0.50]
226; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
227; ZNVER1-NEXT:    retq # sched: [1:0.50]
228  %1 = add <8 x i32> %a0, %a1
229  %2 = sub <8 x i32> %a0, %a1
230  %3 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
231  %4 = shufflevector <8 x i32> %2, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
232  store <4 x i32> %3, <4 x i32> *%a2
233  ret <4 x i32> %4
234}
235
; Schedule-annotation test for the 128-bit vgatherdpd: calls the
; llvm.x86.avx2.gather.d.pd intrinsic with a <4 x i32> index vector and a
; trailing i8 immediate of 2, which appears as the scale in the expected
; address operand (%rdi,%xmm1,2). The assertions pin the `sched` annotation on
; every CPU model in the RUN lines.
236define <2 x double> @test_gatherdpd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3) {
237; GENERIC-LABEL: test_gatherdpd:
238; GENERIC:       # %bb.0:
239; GENERIC-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
240; GENERIC-NEXT:    retq # sched: [1:1.00]
241;
242; HASWELL-LABEL: test_gatherdpd:
243; HASWELL:       # %bb.0:
244; HASWELL-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
245; HASWELL-NEXT:    retq # sched: [7:1.00]
246;
247; BROADWELL-LABEL: test_gatherdpd:
248; BROADWELL:       # %bb.0:
249; BROADWELL-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.00]
250; BROADWELL-NEXT:    retq # sched: [7:1.00]
251;
252; SKYLAKE-LABEL: test_gatherdpd:
253; SKYLAKE:       # %bb.0:
254; SKYLAKE-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
255; SKYLAKE-NEXT:    retq # sched: [7:1.00]
256;
257; SKX-LABEL: test_gatherdpd:
258; SKX:       # %bb.0:
259; SKX-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
260; SKX-NEXT:    retq # sched: [7:1.00]
261;
262; ZNVER1-LABEL: test_gatherdpd:
263; ZNVER1:       # %bb.0:
264; ZNVER1-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
265; ZNVER1-NEXT:    retq # sched: [1:0.50]
266  %1 = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3, i8 2)
267  ret <2 x double> %1
268}
269declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
270
; Schedule-annotation test for the 256-bit vgatherdpd: calls the
; llvm.x86.avx2.gather.d.pd.256 intrinsic with a <4 x i32> index vector (an
; xmm register in the expected output) and a trailing i8 immediate of 8, which
; appears as the scale in the address operand (%rdi,%xmm1,8). The assertions
; pin the `sched` annotation on every CPU model in the RUN lines.
271define <4 x double> @test_gatherdpd_ymm(<4 x double> %a0, i8* %a1, <4 x i32> %a2, <4 x double> %a3) {
272; GENERIC-LABEL: test_gatherdpd_ymm:
273; GENERIC:       # %bb.0:
274; GENERIC-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [5:0.50]
275; GENERIC-NEXT:    retq # sched: [1:1.00]
276;
277; HASWELL-LABEL: test_gatherdpd_ymm:
278; HASWELL:       # %bb.0:
279; HASWELL-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [27:4.00]
280; HASWELL-NEXT:    retq # sched: [7:1.00]
281;
282; BROADWELL-LABEL: test_gatherdpd_ymm:
283; BROADWELL:       # %bb.0:
284; BROADWELL-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [26:5.00]
285; BROADWELL-NEXT:    retq # sched: [7:1.00]
286;
287; SKYLAKE-LABEL: test_gatherdpd_ymm:
288; SKYLAKE:       # %bb.0:
289; SKYLAKE-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [25:1.00]
290; SKYLAKE-NEXT:    retq # sched: [7:1.00]
291;
292; SKX-LABEL: test_gatherdpd_ymm:
293; SKX:       # %bb.0:
294; SKX-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [25:1.00]
295; SKX-NEXT:    retq # sched: [7:1.00]
296;
297; ZNVER1-LABEL: test_gatherdpd_ymm:
298; ZNVER1:       # %bb.0:
299; ZNVER1-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [100:0.25]
300; ZNVER1-NEXT:    retq # sched: [1:0.50]
301  %1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %a1, <4 x i32> %a2, <4 x double> %a3, i8 8)
302  ret <4 x double> %1
303}
304declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
305
; Schedule-annotation test for the 128-bit vgatherdps: calls the
; llvm.x86.avx2.gather.d.ps intrinsic with a <4 x i32> index vector and a
; trailing i8 immediate of 2, which appears as the scale in the expected
; address operand (%rdi,%xmm1,2). The assertions pin the `sched` annotation on
; every CPU model in the RUN lines.
306define <4 x float> @test_gatherdps(<4 x float> %a0, i8* %a1, <4 x i32> %a2, <4 x float> %a3) {
307; GENERIC-LABEL: test_gatherdps:
308; GENERIC:       # %bb.0:
309; GENERIC-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
310; GENERIC-NEXT:    retq # sched: [1:1.00]
311;
312; HASWELL-LABEL: test_gatherdps:
313; HASWELL:       # %bb.0:
314; HASWELL-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.67]
315; HASWELL-NEXT:    retq # sched: [7:1.00]
316;
317; BROADWELL-LABEL: test_gatherdps:
318; BROADWELL:       # %bb.0:
319; BROADWELL-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.00]
320; BROADWELL-NEXT:    retq # sched: [7:1.00]
321;
322; SKYLAKE-LABEL: test_gatherdps:
323; SKYLAKE:       # %bb.0:
324; SKYLAKE-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
325; SKYLAKE-NEXT:    retq # sched: [7:1.00]
326;
327; SKX-LABEL: test_gatherdps:
328; SKX:       # %bb.0:
329; SKX-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
330; SKX-NEXT:    retq # sched: [7:1.00]
331;
332; ZNVER1-LABEL: test_gatherdps:
333; ZNVER1:       # %bb.0:
334; ZNVER1-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
335; ZNVER1-NEXT:    retq # sched: [1:0.50]
336  %1 = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %a1, <4 x i32> %a2, <4 x float> %a3, i8 2)
337  ret <4 x float> %1
338}
339declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
340
; Schedule-annotation test for the 256-bit vgatherdps: calls the
; llvm.x86.avx2.gather.d.ps.256 intrinsic with an <8 x i32> index vector and a
; trailing i8 immediate of 4, which appears as the scale in the expected
; address operand (%rdi,%ymm1,4). The assertions pin the `sched` annotation on
; every CPU model in the RUN lines.
341define <8 x float> @test_gatherdps_ymm(<8 x float> %a0, i8* %a1, <8 x i32> %a2, <8 x float> %a3) {
342; GENERIC-LABEL: test_gatherdps_ymm:
343; GENERIC:       # %bb.0:
344; GENERIC-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [5:0.50]
345; GENERIC-NEXT:    retq # sched: [1:1.00]
346;
347; HASWELL-LABEL: test_gatherdps_ymm:
348; HASWELL:       # %bb.0:
349; HASWELL-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [27:6.50]
350; HASWELL-NEXT:    retq # sched: [7:1.00]
351;
352; BROADWELL-LABEL: test_gatherdps_ymm:
353; BROADWELL:       # %bb.0:
354; BROADWELL-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [26:4.00]
355; BROADWELL-NEXT:    retq # sched: [7:1.00]
356;
357; SKYLAKE-LABEL: test_gatherdps_ymm:
358; SKYLAKE:       # %bb.0:
359; SKYLAKE-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [25:1.00]
360; SKYLAKE-NEXT:    retq # sched: [7:1.00]
361;
362; SKX-LABEL: test_gatherdps_ymm:
363; SKX:       # %bb.0:
364; SKX-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [25:1.00]
365; SKX-NEXT:    retq # sched: [7:1.00]
366;
367; ZNVER1-LABEL: test_gatherdps_ymm:
368; ZNVER1:       # %bb.0:
369; ZNVER1-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [100:0.25]
370; ZNVER1-NEXT:    retq # sched: [1:0.50]
371  %1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %a1, <8 x i32> %a2, <8 x float> %a3, i8 4)
372  ret <8 x float> %1
373}
374declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
375
; Schedule-annotation test for the 128-bit vgatherqpd: calls the
; llvm.x86.avx2.gather.q.pd intrinsic with a <2 x i64> index vector and a
; trailing i8 immediate of 2, which appears as the scale in the expected
; address operand (%rdi,%xmm1,2). The assertions pin the `sched` annotation on
; every CPU model in the RUN lines.
376define <2 x double> @test_gatherqpd(<2 x double> %a0, i8* %a1, <2 x i64> %a2, <2 x double> %a3) {
377; GENERIC-LABEL: test_gatherqpd:
378; GENERIC:       # %bb.0:
379; GENERIC-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
380; GENERIC-NEXT:    retq # sched: [1:1.00]
381;
382; HASWELL-LABEL: test_gatherqpd:
383; HASWELL:       # %bb.0:
384; HASWELL-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [23:3.33]
385; HASWELL-NEXT:    retq # sched: [7:1.00]
386;
387; BROADWELL-LABEL: test_gatherqpd:
388; BROADWELL:       # %bb.0:
389; BROADWELL-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:3.00]
390; BROADWELL-NEXT:    retq # sched: [7:1.00]
391;
392; SKYLAKE-LABEL: test_gatherqpd:
393; SKYLAKE:       # %bb.0:
394; SKYLAKE-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
395; SKYLAKE-NEXT:    retq # sched: [7:1.00]
396;
397; SKX-LABEL: test_gatherqpd:
398; SKX:       # %bb.0:
399; SKX-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
400; SKX-NEXT:    retq # sched: [7:1.00]
401;
402; ZNVER1-LABEL: test_gatherqpd:
403; ZNVER1:       # %bb.0:
404; ZNVER1-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
405; ZNVER1-NEXT:    retq # sched: [1:0.50]
406  %1 = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %a1, <2 x i64> %a2, <2 x double> %a3, i8 2)
407  ret <2 x double> %1
408}
409declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
410
; Schedule-annotation test for the 256-bit vgatherqpd: calls the
; llvm.x86.avx2.gather.q.pd.256 intrinsic with a <4 x i64> index vector and a
; trailing i8 immediate of 8, which appears as the scale in the expected
; address operand (%rdi,%ymm1,8). The assertions pin the `sched` annotation on
; every CPU model in the RUN lines.
411define <4 x double> @test_gatherqpd_ymm(<4 x double> %a0, i8* %a1, <4 x i64> %a2, <4 x double> %a3) {
412; GENERIC-LABEL: test_gatherqpd_ymm:
413; GENERIC:       # %bb.0:
414; GENERIC-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [5:0.50]
415; GENERIC-NEXT:    retq # sched: [1:1.00]
416;
417; HASWELL-LABEL: test_gatherqpd_ymm:
418; HASWELL:       # %bb.0:
419; HASWELL-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [24:5.00]
420; HASWELL-NEXT:    retq # sched: [7:1.00]
421;
422; BROADWELL-LABEL: test_gatherqpd_ymm:
423; BROADWELL:       # %bb.0:
424; BROADWELL-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [23:3.00]
425; BROADWELL-NEXT:    retq # sched: [7:1.00]
426;
427; SKYLAKE-LABEL: test_gatherqpd_ymm:
428; SKYLAKE:       # %bb.0:
429; SKYLAKE-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [25:1.00]
430; SKYLAKE-NEXT:    retq # sched: [7:1.00]
431;
432; SKX-LABEL: test_gatherqpd_ymm:
433; SKX:       # %bb.0:
434; SKX-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [25:1.00]
435; SKX-NEXT:    retq # sched: [7:1.00]
436;
437; ZNVER1-LABEL: test_gatherqpd_ymm:
438; ZNVER1:       # %bb.0:
439; ZNVER1-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [100:0.25]
440; ZNVER1-NEXT:    retq # sched: [1:0.50]
441  %1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %a1, <4 x i64> %a2, <4 x double> %a3, i8 8)
442  ret <4 x double> %1
443}
444declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
445
; Schedule-annotation test for the 128-bit vgatherqps: calls the
; llvm.x86.avx2.gather.q.ps intrinsic with a <2 x i64> index vector and a
; trailing i8 immediate of 2, which appears as the scale in the expected
; address operand (%rdi,%xmm1,2). The assertions pin the `sched` annotation on
; every CPU model in the RUN lines.
446define <4 x float> @test_gatherqps(<4 x float> %a0, i8* %a1, <2 x i64> %a2, <4 x float> %a3) {
447; GENERIC-LABEL: test_gatherqps:
448; GENERIC:       # %bb.0:
449; GENERIC-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
450; GENERIC-NEXT:    retq # sched: [1:1.00]
451;
452; HASWELL-LABEL: test_gatherqps:
453; HASWELL:       # %bb.0:
454; HASWELL-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.67]
455; HASWELL-NEXT:    retq # sched: [7:1.00]
456;
457; BROADWELL-LABEL: test_gatherqps:
458; BROADWELL:       # %bb.0:
459; BROADWELL-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [27:5.00]
460; BROADWELL-NEXT:    retq # sched: [7:1.00]
461;
462; SKYLAKE-LABEL: test_gatherqps:
463; SKYLAKE:       # %bb.0:
464; SKYLAKE-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
465; SKYLAKE-NEXT:    retq # sched: [7:1.00]
466;
467; SKX-LABEL: test_gatherqps:
468; SKX:       # %bb.0:
469; SKX-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
470; SKX-NEXT:    retq # sched: [7:1.00]
471;
472; ZNVER1-LABEL: test_gatherqps:
473; ZNVER1:       # %bb.0:
474; ZNVER1-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
475; ZNVER1-NEXT:    retq # sched: [1:0.50]
476  %1 = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %a1, <2 x i64> %a2, <4 x float> %a3, i8 2)
477  ret <4 x float> %1
478}
479declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
480
; Schedule-annotation test for vgatherqps with a 256-bit index: calls the
; llvm.x86.avx2.gather.q.ps.256 intrinsic with a <4 x i64> index vector (ymm1
; in the expected output) while the data and mask stay <4 x float> (xmm). The
; trailing i8 immediate of 4 appears as the scale in the address operand, and
; a vzeroupper is expected before the return on every CPU model.
481define <4 x float> @test_gatherqps_ymm(<4 x float> %a0, i8* %a1, <4 x i64> %a2, <4 x float> %a3) {
482; GENERIC-LABEL: test_gatherqps_ymm:
483; GENERIC:       # %bb.0:
484; GENERIC-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [5:0.50]
485; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
486; GENERIC-NEXT:    retq # sched: [1:1.00]
487;
488; HASWELL-LABEL: test_gatherqps_ymm:
489; HASWELL:       # %bb.0:
490; HASWELL-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [28:3.67]
491; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
492; HASWELL-NEXT:    retq # sched: [7:1.00]
493;
494; BROADWELL-LABEL: test_gatherqps_ymm:
495; BROADWELL:       # %bb.0:
496; BROADWELL-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [24:5.00]
497; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
498; BROADWELL-NEXT:    retq # sched: [7:1.00]
499;
500; SKYLAKE-LABEL: test_gatherqps_ymm:
501; SKYLAKE:       # %bb.0:
502; SKYLAKE-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
503; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
504; SKYLAKE-NEXT:    retq # sched: [7:1.00]
505;
506; SKX-LABEL: test_gatherqps_ymm:
507; SKX:       # %bb.0:
508; SKX-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
509; SKX-NEXT:    vzeroupper # sched: [4:1.00]
510; SKX-NEXT:    retq # sched: [7:1.00]
511;
512; ZNVER1-LABEL: test_gatherqps_ymm:
513; ZNVER1:       # %bb.0:
514; ZNVER1-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [100:0.25]
515; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
516; ZNVER1-NEXT:    retq # sched: [1:0.50]
517  %1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %a1, <4 x i64> %a2, <4 x float> %a3, i8 4)
518  ret <4 x float> %1
519}
520declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
521
; Schedule-annotation test: replaces elements 4..7 of %a0 with a <4 x i32>
; twice, once taken from the register argument %a1 and once loaded from %a2,
; then adds the two results. The assertions below therefore cover both the
; register-source and the memory-source form of vinserti128, plus the final
; vpaddd, on every CPU model in the RUN lines.
522define <8 x i32> @test_inserti128(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
523; GENERIC-LABEL: test_inserti128:
524; GENERIC:       # %bb.0:
525; GENERIC-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
526; GENERIC-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
527; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
528; GENERIC-NEXT:    retq # sched: [1:1.00]
529;
530; HASWELL-LABEL: test_inserti128:
531; HASWELL:       # %bb.0:
532; HASWELL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
533; HASWELL-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
534; HASWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
535; HASWELL-NEXT:    retq # sched: [7:1.00]
536;
537; BROADWELL-LABEL: test_inserti128:
538; BROADWELL:       # %bb.0:
539; BROADWELL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
540; BROADWELL-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:0.50]
541; BROADWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
542; BROADWELL-NEXT:    retq # sched: [7:1.00]
543;
544; SKYLAKE-LABEL: test_inserti128:
545; SKYLAKE:       # %bb.0:
546; SKYLAKE-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
547; SKYLAKE-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
548; SKYLAKE-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
549; SKYLAKE-NEXT:    retq # sched: [7:1.00]
550;
551; SKX-LABEL: test_inserti128:
552; SKX:       # %bb.0:
553; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
554; SKX-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
555; SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
556; SKX-NEXT:    retq # sched: [7:1.00]
557;
558; ZNVER1-LABEL: test_inserti128:
559; ZNVER1:       # %bb.0:
560; ZNVER1-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [2:0.25]
561; ZNVER1-NEXT:    vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
562; ZNVER1-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
563; ZNVER1-NEXT:    retq # sched: [1:0.50]
564  %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
565  %2 = shufflevector <8 x i32> %a0, <8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
566  %3 = load <4 x i32>, <4 x i32> *%a2, align 16
567  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
568  %5 = shufflevector <8 x i32> %a0, <8 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
569  %6 = add <8 x i32> %2, %5
570  ret <8 x i32> %6
571}
572
; Schedule-annotation test for the 256-bit vmovntdqa: calls the
; llvm.x86.avx2.movntdqa intrinsic on the pointer %a0 and returns the result.
; The assertions expect a single vmovntdqa load from (%rdi) into ymm0 and pin
; its `sched` annotation on every CPU model in the RUN lines.
573define <4 x i64> @test_movntdqa(i8* %a0) {
574; GENERIC-LABEL: test_movntdqa:
575; GENERIC:       # %bb.0:
576; GENERIC-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
577; GENERIC-NEXT:    retq # sched: [1:1.00]
578;
579; HASWELL-LABEL: test_movntdqa:
580; HASWELL:       # %bb.0:
581; HASWELL-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
582; HASWELL-NEXT:    retq # sched: [7:1.00]
583;
584; BROADWELL-LABEL: test_movntdqa:
585; BROADWELL:       # %bb.0:
586; BROADWELL-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [6:0.50]
587; BROADWELL-NEXT:    retq # sched: [7:1.00]
588;
589; SKYLAKE-LABEL: test_movntdqa:
590; SKYLAKE:       # %bb.0:
591; SKYLAKE-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
592; SKYLAKE-NEXT:    retq # sched: [7:1.00]
593;
594; SKX-LABEL: test_movntdqa:
595; SKX:       # %bb.0:
596; SKX-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
597; SKX-NEXT:    retq # sched: [7:1.00]
598;
599; ZNVER1-LABEL: test_movntdqa:
600; ZNVER1:       # %bb.0:
601; ZNVER1-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [8:0.50]
602; ZNVER1-NEXT:    retq # sched: [1:0.50]
603  %1 = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0)
604  ret <4 x i64> %1
605}
606declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
607
; Schedule-annotation test for the 256-bit vmpsadbw: chains two calls to the
; llvm.x86.avx2.mpsadbw intrinsic with immediate 7. The first takes both
; operands from registers; its result is bitcast back to <32 x i8> and fed to
; the second call together with a vector loaded from %a2. The assertions
; therefore cover both the register-source and the memory-source form.
608define <16 x i16> @test_mpsadbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
609; GENERIC-LABEL: test_mpsadbw:
610; GENERIC:       # %bb.0:
611; GENERIC-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:1.00]
612; GENERIC-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [14:1.00]
613; GENERIC-NEXT:    retq # sched: [1:1.00]
614;
615; HASWELL-LABEL: test_mpsadbw:
616; HASWELL:       # %bb.0:
617; HASWELL-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:2.00]
618; HASWELL-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00]
619; HASWELL-NEXT:    retq # sched: [7:1.00]
620;
621; BROADWELL-LABEL: test_mpsadbw:
622; BROADWELL:       # %bb.0:
623; BROADWELL-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:2.00]
624; BROADWELL-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
625; BROADWELL-NEXT:    retq # sched: [7:1.00]
626;
627; SKYLAKE-LABEL: test_mpsadbw:
628; SKYLAKE:       # %bb.0:
629; SKYLAKE-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
630; SKYLAKE-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
631; SKYLAKE-NEXT:    retq # sched: [7:1.00]
632;
633; SKX-LABEL: test_mpsadbw:
634; SKX:       # %bb.0:
635; SKX-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
636; SKX-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
637; SKX-NEXT:    retq # sched: [7:1.00]
638;
639; ZNVER1-LABEL: test_mpsadbw:
640; ZNVER1:       # %bb.0:
641; ZNVER1-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
642; ZNVER1-NEXT:    vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
643; ZNVER1-NEXT:    retq # sched: [1:0.50]
644  %1 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
645  %2 = bitcast <16 x i16> %1 to <32 x i8>
646  %3 = load <32 x i8>, <32 x i8> *%a2, align 32
647  %4 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %2, <32 x i8> %3, i8 7)
648  ret <16 x i16> %4
649}
650declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
651
; Schedule-annotation test for the 256-bit vpabsb: applies the
; llvm.x86.avx2.pabs.b intrinsic once to the register argument %a0 and once to
; a vector loaded from %a1, then ORs the two results. The assertions cover the
; register-source and memory-source forms plus the vpor. Note that the znver1
; expectations list the memory-source vpabsb first, unlike the other models.
652define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) {
653; GENERIC-LABEL: test_pabsb:
654; GENERIC:       # %bb.0:
655; GENERIC-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
656; GENERIC-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
657; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
658; GENERIC-NEXT:    retq # sched: [1:1.00]
659;
660; HASWELL-LABEL: test_pabsb:
661; HASWELL:       # %bb.0:
662; HASWELL-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
663; HASWELL-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
664; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
665; HASWELL-NEXT:    retq # sched: [7:1.00]
666;
667; BROADWELL-LABEL: test_pabsb:
668; BROADWELL:       # %bb.0:
669; BROADWELL-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
670; BROADWELL-NEXT:    vpabsb (%rdi), %ymm1 # sched: [7:0.50]
671; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
672; BROADWELL-NEXT:    retq # sched: [7:1.00]
673;
674; SKYLAKE-LABEL: test_pabsb:
675; SKYLAKE:       # %bb.0:
676; SKYLAKE-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
677; SKYLAKE-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
678; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
679; SKYLAKE-NEXT:    retq # sched: [7:1.00]
680;
681; SKX-LABEL: test_pabsb:
682; SKX:       # %bb.0:
683; SKX-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.50]
684; SKX-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
685; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
686; SKX-NEXT:    retq # sched: [7:1.00]
687;
688; ZNVER1-LABEL: test_pabsb:
689; ZNVER1:       # %bb.0:
690; ZNVER1-NEXT:    vpabsb (%rdi), %ymm1 # sched: [8:0.50]
691; ZNVER1-NEXT:    vpabsb %ymm0, %ymm0 # sched: [1:0.25]
692; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
693; ZNVER1-NEXT:    retq # sched: [1:0.50]
694  %1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0)
695  %2 = load <32 x i8>, <32 x i8> *%a1, align 32
696  %3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2)
697  %4 = or <32 x i8> %1, %3
698  ret <32 x i8> %4
699}
700declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
701
; Schedule-annotation test for the 256-bit vpabsd: applies the
; llvm.x86.avx2.pabs.d intrinsic once to the register argument %a0 and once to
; a vector loaded from %a1, then ORs the two results. The assertions cover the
; register-source and memory-source forms plus the vpor. Note that the znver1
; expectations list the memory-source vpabsd first, unlike the other models.
702define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) {
703; GENERIC-LABEL: test_pabsd:
704; GENERIC:       # %bb.0:
705; GENERIC-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
706; GENERIC-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
707; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
708; GENERIC-NEXT:    retq # sched: [1:1.00]
709;
710; HASWELL-LABEL: test_pabsd:
711; HASWELL:       # %bb.0:
712; HASWELL-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
713; HASWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
714; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
715; HASWELL-NEXT:    retq # sched: [7:1.00]
716;
717; BROADWELL-LABEL: test_pabsd:
718; BROADWELL:       # %bb.0:
719; BROADWELL-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
720; BROADWELL-NEXT:    vpabsd (%rdi), %ymm1 # sched: [7:0.50]
721; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
722; BROADWELL-NEXT:    retq # sched: [7:1.00]
723;
724; SKYLAKE-LABEL: test_pabsd:
725; SKYLAKE:       # %bb.0:
726; SKYLAKE-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
727; SKYLAKE-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
728; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
729; SKYLAKE-NEXT:    retq # sched: [7:1.00]
730;
731; SKX-LABEL: test_pabsd:
732; SKX:       # %bb.0:
733; SKX-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.50]
734; SKX-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
735; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
736; SKX-NEXT:    retq # sched: [7:1.00]
737;
738; ZNVER1-LABEL: test_pabsd:
739; ZNVER1:       # %bb.0:
740; ZNVER1-NEXT:    vpabsd (%rdi), %ymm1 # sched: [8:0.50]
741; ZNVER1-NEXT:    vpabsd %ymm0, %ymm0 # sched: [1:0.25]
742; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
743; ZNVER1-NEXT:    retq # sched: [1:0.50]
744  %1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0)
745  %2 = load <8 x i32>, <8 x i32> *%a1, align 32
746  %3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2)
747  %4 = or <8 x i32> %1, %3
748  ret <8 x i32> %4
749}
750declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
751
; test_pabsw: calls @llvm.x86.avx2.pabs.w once on the register argument %a0
; and once on a vector loaded from %a1, then ORs the two results.
; The per-CPU check lines below expect one register-form vpabsw, one
; memory-form vpabsw (%rdi) and a vpor, each with the scheduling annotation
; printed by -print-schedule. ZNVER1 expects the memory form first.
752define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) {
753; GENERIC-LABEL: test_pabsw:
754; GENERIC:       # %bb.0:
755; GENERIC-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
756; GENERIC-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
757; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
758; GENERIC-NEXT:    retq # sched: [1:1.00]
759;
760; HASWELL-LABEL: test_pabsw:
761; HASWELL:       # %bb.0:
762; HASWELL-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
763; HASWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
764; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
765; HASWELL-NEXT:    retq # sched: [7:1.00]
766;
767; BROADWELL-LABEL: test_pabsw:
768; BROADWELL:       # %bb.0:
769; BROADWELL-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
770; BROADWELL-NEXT:    vpabsw (%rdi), %ymm1 # sched: [7:0.50]
771; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
772; BROADWELL-NEXT:    retq # sched: [7:1.00]
773;
774; SKYLAKE-LABEL: test_pabsw:
775; SKYLAKE:       # %bb.0:
776; SKYLAKE-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
777; SKYLAKE-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
778; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
779; SKYLAKE-NEXT:    retq # sched: [7:1.00]
780;
781; SKX-LABEL: test_pabsw:
782; SKX:       # %bb.0:
783; SKX-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.50]
784; SKX-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
785; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
786; SKX-NEXT:    retq # sched: [7:1.00]
787;
788; ZNVER1-LABEL: test_pabsw:
789; ZNVER1:       # %bb.0:
790; ZNVER1-NEXT:    vpabsw (%rdi), %ymm1 # sched: [8:0.50]
791; ZNVER1-NEXT:    vpabsw %ymm0, %ymm0 # sched: [1:0.25]
792; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
793; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-operand call; %3 consumes the loaded value %2; the
; final `or` uses both results so neither call is dead.
794  %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0)
795  %2 = load <16 x i16>, <16 x i16> *%a1, align 32
796  %3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2)
797  %4 = or <16 x i16> %1, %3
798  ret <16 x i16> %4
799}
800declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
801
; test_packssdw: calls @llvm.x86.avx2.packssdw on (%a0, %a1), bitcasts the
; result back to <8 x i32>, and calls the intrinsic again with a vector
; loaded from %a2 as the second operand.
; The per-CPU check lines below expect a register-form vpackssdw followed by
; a memory-form vpackssdw (%rdi), each with the scheduling annotation printed
; by -print-schedule.
802define <16 x i16> @test_packssdw(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
803; GENERIC-LABEL: test_packssdw:
804; GENERIC:       # %bb.0:
805; GENERIC-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
806; GENERIC-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
807; GENERIC-NEXT:    retq # sched: [1:1.00]
808;
809; HASWELL-LABEL: test_packssdw:
810; HASWELL:       # %bb.0:
811; HASWELL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
812; HASWELL-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
813; HASWELL-NEXT:    retq # sched: [7:1.00]
814;
815; BROADWELL-LABEL: test_packssdw:
816; BROADWELL:       # %bb.0:
817; BROADWELL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
818; BROADWELL-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
819; BROADWELL-NEXT:    retq # sched: [7:1.00]
820;
821; SKYLAKE-LABEL: test_packssdw:
822; SKYLAKE:       # %bb.0:
823; SKYLAKE-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
824; SKYLAKE-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
825; SKYLAKE-NEXT:    retq # sched: [7:1.00]
826;
827; SKX-LABEL: test_packssdw:
828; SKX:       # %bb.0:
829; SKX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
830; SKX-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
831; SKX-NEXT:    retq # sched: [7:1.00]
832;
833; ZNVER1-LABEL: test_packssdw:
834; ZNVER1:       # %bb.0:
835; ZNVER1-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
836; ZNVER1-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
837; ZNVER1-NEXT:    retq # sched: [1:0.50]
; The bitcast only retypes the first result (<16 x i16> -> <8 x i32>) so it
; can be passed as the first operand of the second call.
838  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
839  %2 = bitcast <16 x i16> %1 to <8 x i32>
840  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
841  %4 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %2, <8 x i32> %3)
842  ret <16 x i16> %4
843}
844declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
845
; test_packsswb: calls @llvm.x86.avx2.packsswb on (%a0, %a1), bitcasts the
; result back to <16 x i16>, and calls the intrinsic again with a vector
; loaded from %a2 as the second operand.
; The per-CPU check lines below expect a register-form vpacksswb followed by
; a memory-form vpacksswb (%rdi), each with the scheduling annotation printed
; by -print-schedule.
846define <32 x i8> @test_packsswb(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
847; GENERIC-LABEL: test_packsswb:
848; GENERIC:       # %bb.0:
849; GENERIC-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
850; GENERIC-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
851; GENERIC-NEXT:    retq # sched: [1:1.00]
852;
853; HASWELL-LABEL: test_packsswb:
854; HASWELL:       # %bb.0:
855; HASWELL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
856; HASWELL-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
857; HASWELL-NEXT:    retq # sched: [7:1.00]
858;
859; BROADWELL-LABEL: test_packsswb:
860; BROADWELL:       # %bb.0:
861; BROADWELL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
862; BROADWELL-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
863; BROADWELL-NEXT:    retq # sched: [7:1.00]
864;
865; SKYLAKE-LABEL: test_packsswb:
866; SKYLAKE:       # %bb.0:
867; SKYLAKE-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
868; SKYLAKE-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
869; SKYLAKE-NEXT:    retq # sched: [7:1.00]
870;
871; SKX-LABEL: test_packsswb:
872; SKX:       # %bb.0:
873; SKX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
874; SKX-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
875; SKX-NEXT:    retq # sched: [7:1.00]
876;
877; ZNVER1-LABEL: test_packsswb:
878; ZNVER1:       # %bb.0:
879; ZNVER1-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
880; ZNVER1-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
881; ZNVER1-NEXT:    retq # sched: [1:0.50]
; The bitcast only retypes the first result (<32 x i8> -> <16 x i16>) so it
; can be passed as the first operand of the second call.
882  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
883  %2 = bitcast <32 x i8> %1 to <16 x i16>
884  %3 = load <16 x i16>, <16 x i16> *%a2, align 32
885  %4 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %2, <16 x i16> %3)
886  ret <32 x i8> %4
887}
888declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
889
; test_packusdw: calls @llvm.x86.avx2.packusdw on (%a0, %a1), bitcasts the
; result back to <8 x i32>, and calls the intrinsic again with a vector
; loaded from %a2 as the second operand.
; The per-CPU check lines below expect a register-form vpackusdw followed by
; a memory-form vpackusdw (%rdi), each with the scheduling annotation printed
; by -print-schedule.
890define <16 x i16> @test_packusdw(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
891; GENERIC-LABEL: test_packusdw:
892; GENERIC:       # %bb.0:
893; GENERIC-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
894; GENERIC-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
895; GENERIC-NEXT:    retq # sched: [1:1.00]
896;
897; HASWELL-LABEL: test_packusdw:
898; HASWELL:       # %bb.0:
899; HASWELL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
900; HASWELL-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
901; HASWELL-NEXT:    retq # sched: [7:1.00]
902;
903; BROADWELL-LABEL: test_packusdw:
904; BROADWELL:       # %bb.0:
905; BROADWELL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
906; BROADWELL-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
907; BROADWELL-NEXT:    retq # sched: [7:1.00]
908;
909; SKYLAKE-LABEL: test_packusdw:
910; SKYLAKE:       # %bb.0:
911; SKYLAKE-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
912; SKYLAKE-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
913; SKYLAKE-NEXT:    retq # sched: [7:1.00]
914;
915; SKX-LABEL: test_packusdw:
916; SKX:       # %bb.0:
917; SKX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
918; SKX-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
919; SKX-NEXT:    retq # sched: [7:1.00]
920;
921; ZNVER1-LABEL: test_packusdw:
922; ZNVER1:       # %bb.0:
923; ZNVER1-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
924; ZNVER1-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
925; ZNVER1-NEXT:    retq # sched: [1:0.50]
; The bitcast only retypes the first result (<16 x i16> -> <8 x i32>) so it
; can be passed as the first operand of the second call.
926  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
927  %2 = bitcast <16 x i16> %1 to <8 x i32>
928  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
929  %4 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %2, <8 x i32> %3)
930  ret <16 x i16> %4
931}
932declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
933
; test_packuswb: calls @llvm.x86.avx2.packuswb on (%a0, %a1), bitcasts the
; result back to <16 x i16>, and calls the intrinsic again with a vector
; loaded from %a2 as the second operand.
; The per-CPU check lines below expect a register-form vpackuswb followed by
; a memory-form vpackuswb (%rdi), each with the scheduling annotation printed
; by -print-schedule.
934define <32 x i8> @test_packuswb(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
935; GENERIC-LABEL: test_packuswb:
936; GENERIC:       # %bb.0:
937; GENERIC-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
938; GENERIC-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
939; GENERIC-NEXT:    retq # sched: [1:1.00]
940;
941; HASWELL-LABEL: test_packuswb:
942; HASWELL:       # %bb.0:
943; HASWELL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
944; HASWELL-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
945; HASWELL-NEXT:    retq # sched: [7:1.00]
946;
947; BROADWELL-LABEL: test_packuswb:
948; BROADWELL:       # %bb.0:
949; BROADWELL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
950; BROADWELL-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
951; BROADWELL-NEXT:    retq # sched: [7:1.00]
952;
953; SKYLAKE-LABEL: test_packuswb:
954; SKYLAKE:       # %bb.0:
955; SKYLAKE-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
956; SKYLAKE-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
957; SKYLAKE-NEXT:    retq # sched: [7:1.00]
958;
959; SKX-LABEL: test_packuswb:
960; SKX:       # %bb.0:
961; SKX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
962; SKX-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
963; SKX-NEXT:    retq # sched: [7:1.00]
964;
965; ZNVER1-LABEL: test_packuswb:
966; ZNVER1:       # %bb.0:
967; ZNVER1-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
968; ZNVER1-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
969; ZNVER1-NEXT:    retq # sched: [1:0.50]
; The bitcast only retypes the first result (<32 x i8> -> <16 x i16>) so it
; can be passed as the first operand of the second call.
970  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
971  %2 = bitcast <32 x i8> %1 to <16 x i16>
972  %3 = load <16 x i16>, <16 x i16> *%a2, align 32
973  %4 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %2, <16 x i16> %3)
974  ret <32 x i8> %4
975}
976declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
977
; test_paddb: adds %a0 and %a1 with a plain IR `add` on <32 x i8>, then adds
; a vector loaded from %a2 to that sum.
; The per-CPU check lines below expect a register-form vpaddb followed by a
; memory-form vpaddb (%rdi), each with the scheduling annotation printed by
; -print-schedule.
978define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
979; GENERIC-LABEL: test_paddb:
980; GENERIC:       # %bb.0:
981; GENERIC-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
982; GENERIC-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
983; GENERIC-NEXT:    retq # sched: [1:1.00]
984;
985; HASWELL-LABEL: test_paddb:
986; HASWELL:       # %bb.0:
987; HASWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
988; HASWELL-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
989; HASWELL-NEXT:    retq # sched: [7:1.00]
990;
991; BROADWELL-LABEL: test_paddb:
992; BROADWELL:       # %bb.0:
993; BROADWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
994; BROADWELL-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
995; BROADWELL-NEXT:    retq # sched: [7:1.00]
996;
997; SKYLAKE-LABEL: test_paddb:
998; SKYLAKE:       # %bb.0:
999; SKYLAKE-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1000; SKYLAKE-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1001; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1002;
1003; SKX-LABEL: test_paddb:
1004; SKX:       # %bb.0:
1005; SKX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1006; SKX-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1007; SKX-NEXT:    retq # sched: [7:1.00]
1008;
1009; ZNVER1-LABEL: test_paddb:
1010; ZNVER1:       # %bb.0:
1011; ZNVER1-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1012; ZNVER1-NEXT:    vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1013; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register sum; %3 adds the loaded vector %2 to it.
1014  %1 = add <32 x i8> %a0, %a1
1015  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
1016  %3 = add <32 x i8> %1, %2
1017  ret <32 x i8> %3
1018}
1019
; test_paddd: adds %a0 and %a1 with a plain IR `add` on <8 x i32>, then adds
; a vector loaded from %a2 to that sum.
; The per-CPU check lines below expect a register-form vpaddd followed by a
; memory-form vpaddd (%rdi), each with the scheduling annotation printed by
; -print-schedule.
1020define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
1021; GENERIC-LABEL: test_paddd:
1022; GENERIC:       # %bb.0:
1023; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1024; GENERIC-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1025; GENERIC-NEXT:    retq # sched: [1:1.00]
1026;
1027; HASWELL-LABEL: test_paddd:
1028; HASWELL:       # %bb.0:
1029; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1030; HASWELL-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1031; HASWELL-NEXT:    retq # sched: [7:1.00]
1032;
1033; BROADWELL-LABEL: test_paddd:
1034; BROADWELL:       # %bb.0:
1035; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1036; BROADWELL-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1037; BROADWELL-NEXT:    retq # sched: [7:1.00]
1038;
1039; SKYLAKE-LABEL: test_paddd:
1040; SKYLAKE:       # %bb.0:
1041; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1042; SKYLAKE-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1043; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1044;
1045; SKX-LABEL: test_paddd:
1046; SKX:       # %bb.0:
1047; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1048; SKX-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1049; SKX-NEXT:    retq # sched: [7:1.00]
1050;
1051; ZNVER1-LABEL: test_paddd:
1052; ZNVER1:       # %bb.0:
1053; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1054; ZNVER1-NEXT:    vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1055; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register sum; %3 adds the loaded vector %2 to it.
1056  %1 = add <8 x i32> %a0, %a1
1057  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
1058  %3 = add <8 x i32> %1, %2
1059  ret <8 x i32> %3
1060}
1061
; test_paddq: adds %a0 and %a1 with a plain IR `add` on <4 x i64>, then adds
; a vector loaded from %a2 to that sum.
; The per-CPU check lines below expect a register-form vpaddq followed by a
; memory-form vpaddq (%rdi), each with the scheduling annotation printed by
; -print-schedule.
1062define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
1063; GENERIC-LABEL: test_paddq:
1064; GENERIC:       # %bb.0:
1065; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1066; GENERIC-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1067; GENERIC-NEXT:    retq # sched: [1:1.00]
1068;
1069; HASWELL-LABEL: test_paddq:
1070; HASWELL:       # %bb.0:
1071; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1072; HASWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1073; HASWELL-NEXT:    retq # sched: [7:1.00]
1074;
1075; BROADWELL-LABEL: test_paddq:
1076; BROADWELL:       # %bb.0:
1077; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1078; BROADWELL-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1079; BROADWELL-NEXT:    retq # sched: [7:1.00]
1080;
1081; SKYLAKE-LABEL: test_paddq:
1082; SKYLAKE:       # %bb.0:
1083; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1084; SKYLAKE-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1085; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1086;
1087; SKX-LABEL: test_paddq:
1088; SKX:       # %bb.0:
1089; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1090; SKX-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1091; SKX-NEXT:    retq # sched: [7:1.00]
1092;
1093; ZNVER1-LABEL: test_paddq:
1094; ZNVER1:       # %bb.0:
1095; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1096; ZNVER1-NEXT:    vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1097; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register sum; %3 adds the loaded vector %2 to it.
1098  %1 = add <4 x i64> %a0, %a1
1099  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
1100  %3 = add <4 x i64> %1, %2
1101  ret <4 x i64> %3
1102}
1103
; test_paddsb: calls @llvm.x86.avx2.padds.b on (%a0, %a1), then calls it again
; with that result and a vector loaded from %a2.
; The per-CPU check lines below expect a register-form vpaddsb followed by a
; memory-form vpaddsb (%rdi), each with the scheduling annotation printed by
; -print-schedule.
1104define <32 x i8> @test_paddsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
1105; GENERIC-LABEL: test_paddsb:
1106; GENERIC:       # %bb.0:
1107; GENERIC-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1108; GENERIC-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1109; GENERIC-NEXT:    retq # sched: [1:1.00]
1110;
1111; HASWELL-LABEL: test_paddsb:
1112; HASWELL:       # %bb.0:
1113; HASWELL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1114; HASWELL-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1115; HASWELL-NEXT:    retq # sched: [7:1.00]
1116;
1117; BROADWELL-LABEL: test_paddsb:
1118; BROADWELL:       # %bb.0:
1119; BROADWELL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1120; BROADWELL-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1121; BROADWELL-NEXT:    retq # sched: [7:1.00]
1122;
1123; SKYLAKE-LABEL: test_paddsb:
1124; SKYLAKE:       # %bb.0:
1125; SKYLAKE-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1126; SKYLAKE-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1127; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1128;
1129; SKX-LABEL: test_paddsb:
1130; SKX:       # %bb.0:
1131; SKX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1132; SKX-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1133; SKX-NEXT:    retq # sched: [7:1.00]
1134;
1135; ZNVER1-LABEL: test_paddsb:
1136; ZNVER1:       # %bb.0:
1137; ZNVER1-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1138; ZNVER1-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1139; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register call; %3 chains it with the loaded vector %2.
1140  %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1)
1141  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
1142  %3 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %1, <32 x i8> %2)
1143  ret <32 x i8> %3
1144}
1145declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
1146
; test_paddsw: calls @llvm.x86.avx2.padds.w on (%a0, %a1), then calls it again
; with that result and a vector loaded from %a2.
; The per-CPU check lines below expect a register-form vpaddsw followed by a
; memory-form vpaddsw (%rdi), each with the scheduling annotation printed by
; -print-schedule.
1147define <16 x i16> @test_paddsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
1148; GENERIC-LABEL: test_paddsw:
1149; GENERIC:       # %bb.0:
1150; GENERIC-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1151; GENERIC-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1152; GENERIC-NEXT:    retq # sched: [1:1.00]
1153;
1154; HASWELL-LABEL: test_paddsw:
1155; HASWELL:       # %bb.0:
1156; HASWELL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1157; HASWELL-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1158; HASWELL-NEXT:    retq # sched: [7:1.00]
1159;
1160; BROADWELL-LABEL: test_paddsw:
1161; BROADWELL:       # %bb.0:
1162; BROADWELL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1163; BROADWELL-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1164; BROADWELL-NEXT:    retq # sched: [7:1.00]
1165;
1166; SKYLAKE-LABEL: test_paddsw:
1167; SKYLAKE:       # %bb.0:
1168; SKYLAKE-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1169; SKYLAKE-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1170; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1171;
1172; SKX-LABEL: test_paddsw:
1173; SKX:       # %bb.0:
1174; SKX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1175; SKX-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1176; SKX-NEXT:    retq # sched: [7:1.00]
1177;
1178; ZNVER1-LABEL: test_paddsw:
1179; ZNVER1:       # %bb.0:
1180; ZNVER1-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1181; ZNVER1-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1182; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register call; %3 chains it with the loaded vector %2.
1183  %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1)
1184  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
1185  %3 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %1, <16 x i16> %2)
1186  ret <16 x i16> %3
1187}
1188declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
1189
; test_paddusb: calls @llvm.x86.avx2.paddus.b on (%a0, %a1), then calls it
; again with that result and a vector loaded from %a2.
; The per-CPU check lines below expect a register-form vpaddusb followed by a
; memory-form vpaddusb (%rdi), each with the scheduling annotation printed by
; -print-schedule.
1190define <32 x i8> @test_paddusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
1191; GENERIC-LABEL: test_paddusb:
1192; GENERIC:       # %bb.0:
1193; GENERIC-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1194; GENERIC-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1195; GENERIC-NEXT:    retq # sched: [1:1.00]
1196;
1197; HASWELL-LABEL: test_paddusb:
1198; HASWELL:       # %bb.0:
1199; HASWELL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1200; HASWELL-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1201; HASWELL-NEXT:    retq # sched: [7:1.00]
1202;
1203; BROADWELL-LABEL: test_paddusb:
1204; BROADWELL:       # %bb.0:
1205; BROADWELL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1206; BROADWELL-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1207; BROADWELL-NEXT:    retq # sched: [7:1.00]
1208;
1209; SKYLAKE-LABEL: test_paddusb:
1210; SKYLAKE:       # %bb.0:
1211; SKYLAKE-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1212; SKYLAKE-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1213; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1214;
1215; SKX-LABEL: test_paddusb:
1216; SKX:       # %bb.0:
1217; SKX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1218; SKX-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1219; SKX-NEXT:    retq # sched: [7:1.00]
1220;
1221; ZNVER1-LABEL: test_paddusb:
1222; ZNVER1:       # %bb.0:
1223; ZNVER1-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1224; ZNVER1-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1225; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register call; %3 chains it with the loaded vector %2.
1226  %1 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1)
1227  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
1228  %3 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %1, <32 x i8> %2)
1229  ret <32 x i8> %3
1230}
1231declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
1232
; test_paddusw: calls @llvm.x86.avx2.paddus.w on (%a0, %a1), then calls it
; again with that result and a vector loaded from %a2.
; The per-CPU check lines below expect a register-form vpaddusw followed by a
; memory-form vpaddusw (%rdi), each with the scheduling annotation printed by
; -print-schedule.
1233define <16 x i16> @test_paddusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
1234; GENERIC-LABEL: test_paddusw:
1235; GENERIC:       # %bb.0:
1236; GENERIC-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1237; GENERIC-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1238; GENERIC-NEXT:    retq # sched: [1:1.00]
1239;
1240; HASWELL-LABEL: test_paddusw:
1241; HASWELL:       # %bb.0:
1242; HASWELL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1243; HASWELL-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1244; HASWELL-NEXT:    retq # sched: [7:1.00]
1245;
1246; BROADWELL-LABEL: test_paddusw:
1247; BROADWELL:       # %bb.0:
1248; BROADWELL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1249; BROADWELL-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1250; BROADWELL-NEXT:    retq # sched: [7:1.00]
1251;
1252; SKYLAKE-LABEL: test_paddusw:
1253; SKYLAKE:       # %bb.0:
1254; SKYLAKE-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1255; SKYLAKE-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1256; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1257;
1258; SKX-LABEL: test_paddusw:
1259; SKX:       # %bb.0:
1260; SKX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1261; SKX-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1262; SKX-NEXT:    retq # sched: [7:1.00]
1263;
1264; ZNVER1-LABEL: test_paddusw:
1265; ZNVER1:       # %bb.0:
1266; ZNVER1-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1267; ZNVER1-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1268; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register call; %3 chains it with the loaded vector %2.
1269  %1 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1)
1270  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
1271  %3 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %1, <16 x i16> %2)
1272  ret <16 x i16> %3
1273}
1274declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
1275
; test_paddw: adds %a0 and %a1 with a plain IR `add` on <16 x i16>, then adds
; a vector loaded from %a2 to that sum.
; The per-CPU check lines below expect a register-form vpaddw followed by a
; memory-form vpaddw (%rdi), each with the scheduling annotation printed by
; -print-schedule.
1276define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
1277; GENERIC-LABEL: test_paddw:
1278; GENERIC:       # %bb.0:
1279; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1280; GENERIC-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1281; GENERIC-NEXT:    retq # sched: [1:1.00]
1282;
1283; HASWELL-LABEL: test_paddw:
1284; HASWELL:       # %bb.0:
1285; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1286; HASWELL-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1287; HASWELL-NEXT:    retq # sched: [7:1.00]
1288;
1289; BROADWELL-LABEL: test_paddw:
1290; BROADWELL:       # %bb.0:
1291; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1292; BROADWELL-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1293; BROADWELL-NEXT:    retq # sched: [7:1.00]
1294;
1295; SKYLAKE-LABEL: test_paddw:
1296; SKYLAKE:       # %bb.0:
1297; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1298; SKYLAKE-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1299; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1300;
1301; SKX-LABEL: test_paddw:
1302; SKX:       # %bb.0:
1303; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1304; SKX-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1305; SKX-NEXT:    retq # sched: [7:1.00]
1306;
1307; ZNVER1-LABEL: test_paddw:
1308; ZNVER1:       # %bb.0:
1309; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1310; ZNVER1-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1311; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register/register sum; %3 adds the loaded vector %2 to it.
1312  %1 = add <16 x i16> %a0, %a1
1313  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
1314  %3 = add <16 x i16> %1, %2
1315  ret <16 x i16> %3
1316}
1317
; test_palignr: builds two byte shuffles with the same mask
; (1..15, 32, 17..31, 48) -- first on (%a1, %a0), then on (%a0, %1) -- and adds
; the two shuffle results.
; The per-CPU check lines below expect two register-form vpalignr
; instructions followed by a vpaddb, each with the scheduling annotation
; printed by -print-schedule.
;
; NOTE(review): %2 is loaded from %a2 but never used; the second shuffle
; takes %1 as its second operand instead. As a result no check line below
; contains a memory-operand vpalignr, unlike the neighbouring tests, which all
; check a `(%rdi)` form. This looks like a typo for %2 -- confirm, and
; regenerate the assertions with utils/update_llc_test_checks.py if changed.
1318define <32 x i8> @test_palignr(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
1319; GENERIC-LABEL: test_palignr:
1320; GENERIC:       # %bb.0:
1321; GENERIC-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
1322; GENERIC-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
1323; GENERIC-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
1324; GENERIC-NEXT:    retq # sched: [1:1.00]
1325;
1326; HASWELL-LABEL: test_palignr:
1327; HASWELL:       # %bb.0:
1328; HASWELL-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
1329; HASWELL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
1330; HASWELL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
1331; HASWELL-NEXT:    retq # sched: [7:1.00]
1332;
1333; BROADWELL-LABEL: test_palignr:
1334; BROADWELL:       # %bb.0:
1335; BROADWELL-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
1336; BROADWELL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
1337; BROADWELL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
1338; BROADWELL-NEXT:    retq # sched: [7:1.00]
1339;
1340; SKYLAKE-LABEL: test_palignr:
1341; SKYLAKE:       # %bb.0:
1342; SKYLAKE-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
1343; SKYLAKE-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
1344; SKYLAKE-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
1345; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1346;
1347; SKX-LABEL: test_palignr:
1348; SKX:       # %bb.0:
1349; SKX-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
1350; SKX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:1.00]
1351; SKX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
1352; SKX-NEXT:    retq # sched: [7:1.00]
1353;
1354; ZNVER1-LABEL: test_palignr:
1355; ZNVER1:       # %bb.0:
1356; ZNVER1-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:0.25]
1357; ZNVER1-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16] sched: [1:0.25]
1358; ZNVER1-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
1359; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Mask indices 0-31 select from the first shuffle operand and 32-63 from the
; second, so each 16-byte half takes bytes 1..15 of the first operand followed
; by byte 0 of the second operand's matching half (indices 32 and 48).
1360  %1 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
; NOTE(review): %2 has no users anywhere in this function.
1361  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
1362  %3 = shufflevector <32 x i8> %a0, <32 x i8> %1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
1363  %4 = add <32 x i8> %1, %3
1364  ret <32 x i8> %4
1365}
1366
; test_pand: ANDs %a0 with %a1, ANDs that result with a vector loaded from
; %a2, then adds %a1 to the final AND result.
; The per-CPU check lines below expect a register-form vpand, a memory-form
; vpand (%rdi) and a trailing vpaddq, each with the scheduling annotation
; printed by -print-schedule.
1367define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
1368; GENERIC-LABEL: test_pand:
1369; GENERIC:       # %bb.0:
1370; GENERIC-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1371; GENERIC-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1372; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1373; GENERIC-NEXT:    retq # sched: [1:1.00]
1374;
1375; HASWELL-LABEL: test_pand:
1376; HASWELL:       # %bb.0:
1377; HASWELL-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1378; HASWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1379; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1380; HASWELL-NEXT:    retq # sched: [7:1.00]
1381;
1382; BROADWELL-LABEL: test_pand:
1383; BROADWELL:       # %bb.0:
1384; BROADWELL-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1385; BROADWELL-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1386; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1387; BROADWELL-NEXT:    retq # sched: [7:1.00]
1388;
1389; SKYLAKE-LABEL: test_pand:
1390; SKYLAKE:       # %bb.0:
1391; SKYLAKE-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1392; SKYLAKE-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1393; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1394; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1395;
1396; SKX-LABEL: test_pand:
1397; SKX:       # %bb.0:
1398; SKX-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1399; SKX-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1400; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1401; SKX-NEXT:    retq # sched: [7:1.00]
1402;
1403; ZNVER1-LABEL: test_pand:
1404; ZNVER1:       # %bb.0:
1405; ZNVER1-NEXT:    vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1406; ZNVER1-NEXT:    vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1407; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1408; ZNVER1-NEXT:    retq # sched: [1:0.50]
; NOTE(review): the trailing integer `add` (%4) is presumably there to keep
; the AND in the integer domain so vpand is selected rather than a
; floating-point logic instruction -- confirm.
1409  %1 = and <4 x i64> %a0, %a1
1410  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
1411  %3 = and <4 x i64> %1, %2
1412  %4 = add <4 x i64> %3, %a1
1413  ret <4 x i64> %4
1414}
1415
; test_pandn: each "xor with all-ones, then and" pair in the body is expected
; to be selected as a single 256-bit vpandn. The first pair works on the
; register arguments; the second inverts the first result and ANDs it with the
; vector loaded from %a2, which the assertions expect as a (%rdi) memory
; operand. The final add of %2 and %5 consumes both vpandn results.
1416define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
1417; GENERIC-LABEL: test_pandn:
1418; GENERIC:       # %bb.0:
1419; GENERIC-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1420; GENERIC-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
1421; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1422; GENERIC-NEXT:    retq # sched: [1:1.00]
1423;
1424; HASWELL-LABEL: test_pandn:
1425; HASWELL:       # %bb.0:
1426; HASWELL-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1427; HASWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
1428; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1429; HASWELL-NEXT:    retq # sched: [7:1.00]
1430;
1431; BROADWELL-LABEL: test_pandn:
1432; BROADWELL:       # %bb.0:
1433; BROADWELL-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1434; BROADWELL-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [7:0.50]
1435; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1436; BROADWELL-NEXT:    retq # sched: [7:1.00]
1437;
1438; SKYLAKE-LABEL: test_pandn:
1439; SKYLAKE:       # %bb.0:
1440; SKYLAKE-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1441; SKYLAKE-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
1442; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1443; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1444;
1445; SKX-LABEL: test_pandn:
1446; SKX:       # %bb.0:
1447; SKX-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1448; SKX-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
1449; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1450; SKX-NEXT:    retq # sched: [7:1.00]
1451;
1452; ZNVER1-LABEL: test_pandn:
1453; ZNVER1:       # %bb.0:
1454; ZNVER1-NEXT:    vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1455; ZNVER1-NEXT:    vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
1456; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1457; ZNVER1-NEXT:    retq # sched: [1:0.50]
1458  %1 = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
1459  %2 = and <4 x i64> %a1, %1
1460  %3 = load <4 x i64>, <4 x i64> *%a2, align 32
1461  %4 = xor <4 x i64> %2, <i64 -1, i64 -1, i64 -1, i64 -1>
1462  %5 = and <4 x i64> %3, %4
1463  %6 = add <4 x i64> %2, %5
1464  ret <4 x i64> %6
1465}
1466
; test_pavgb: the body spells out a rounding unsigned byte average twice as
; zext to i16 -> add -> add 1 -> lshr 1 -> trunc to i8. The assertions expect
; each occurrence to be matched to one 256-bit vpavgb: the first on the
; register arguments %a0 and %a1, the second on that result and the vector
; loaded from %a2 (emitted with a (%rdi) memory operand).
1467define <32 x i8> @test_pavgb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
1468; GENERIC-LABEL: test_pavgb:
1469; GENERIC:       # %bb.0:
1470; GENERIC-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1471; GENERIC-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1472; GENERIC-NEXT:    retq # sched: [1:1.00]
1473;
1474; HASWELL-LABEL: test_pavgb:
1475; HASWELL:       # %bb.0:
1476; HASWELL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1477; HASWELL-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1478; HASWELL-NEXT:    retq # sched: [7:1.00]
1479;
1480; BROADWELL-LABEL: test_pavgb:
1481; BROADWELL:       # %bb.0:
1482; BROADWELL-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1483; BROADWELL-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1484; BROADWELL-NEXT:    retq # sched: [7:1.00]
1485;
1486; SKYLAKE-LABEL: test_pavgb:
1487; SKYLAKE:       # %bb.0:
1488; SKYLAKE-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1489; SKYLAKE-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1490; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1491;
1492; SKX-LABEL: test_pavgb:
1493; SKX:       # %bb.0:
1494; SKX-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1495; SKX-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1496; SKX-NEXT:    retq # sched: [7:1.00]
1497;
1498; ZNVER1-LABEL: test_pavgb:
1499; ZNVER1:       # %bb.0:
1500; ZNVER1-NEXT:    vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1501; ZNVER1-NEXT:    vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1502; ZNVER1-NEXT:    retq # sched: [1:0.50]
1503  %1 = zext <32 x i8> %a0 to <32 x i16>
1504  %2 = zext <32 x i8> %a1 to <32 x i16>
1505  %3 = add <32 x i16> %1, %2
1506  %4 = add <32 x i16> %3, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1507  %5 = lshr <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1508  %6 = trunc <32 x i16> %5 to <32 x i8>
1509  %7 = load <32 x i8>, <32 x i8> *%a2, align 32
1510  %8 = zext <32 x i8> %6 to <32 x i16>
1511  %9 = zext <32 x i8> %7 to <32 x i16>
1512  %10 = add <32 x i16> %8, %9
1513  %11 = add <32 x i16> %10, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1514  %12 = lshr <32 x i16> %11, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1515  %13 = trunc <32 x i16> %12 to <32 x i8>
1516  ret <32 x i8> %13
1517}
1518
; test_pavgw: 16-bit counterpart of the byte-average test. The rounding
; unsigned average is written out as zext to i32 -> add -> add 1 -> lshr 1 ->
; trunc to i16, twice. The assertions expect one 256-bit vpavgw per
; occurrence: first with the register arguments, then with the vector loaded
; from %a2 as a (%rdi) memory operand.
1519define <16 x i16> @test_pavgw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
1520; GENERIC-LABEL: test_pavgw:
1521; GENERIC:       # %bb.0:
1522; GENERIC-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1523; GENERIC-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1524; GENERIC-NEXT:    retq # sched: [1:1.00]
1525;
1526; HASWELL-LABEL: test_pavgw:
1527; HASWELL:       # %bb.0:
1528; HASWELL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1529; HASWELL-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1530; HASWELL-NEXT:    retq # sched: [7:1.00]
1531;
1532; BROADWELL-LABEL: test_pavgw:
1533; BROADWELL:       # %bb.0:
1534; BROADWELL-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1535; BROADWELL-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
1536; BROADWELL-NEXT:    retq # sched: [7:1.00]
1537;
1538; SKYLAKE-LABEL: test_pavgw:
1539; SKYLAKE:       # %bb.0:
1540; SKYLAKE-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1541; SKYLAKE-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1542; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1543;
1544; SKX-LABEL: test_pavgw:
1545; SKX:       # %bb.0:
1546; SKX-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1547; SKX-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1548; SKX-NEXT:    retq # sched: [7:1.00]
1549;
1550; ZNVER1-LABEL: test_pavgw:
1551; ZNVER1:       # %bb.0:
1552; ZNVER1-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1553; ZNVER1-NEXT:    vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
1554; ZNVER1-NEXT:    retq # sched: [1:0.50]
1555  %1 = zext <16 x i16> %a0 to <16 x i32>
1556  %2 = zext <16 x i16> %a1 to <16 x i32>
1557  %3 = add <16 x i32> %1, %2
1558  %4 = add <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1559  %5 = lshr <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1560  %6 = trunc <16 x i32> %5 to <16 x i16>
1561  %7 = load <16 x i16>, <16 x i16> *%a2, align 32
1562  %8 = zext <16 x i16> %6 to <16 x i32>
1563  %9 = zext <16 x i16> %7 to <16 x i32>
1564  %10 = add <16 x i32> %8, %9
1565  %11 = add <16 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1566  %12 = lshr <16 x i32> %11, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1567  %13 = trunc <16 x i32> %12 to <16 x i16>
1568  ret <16 x i16> %13
1569}
1570
; test_pblendd: both shufflevector masks take every lane from the same lane
; position of one of the two inputs, i.e. they are blends, and the assertions
; expect the 128-bit vpblendd for each. %1 blends the register arguments
; (lanes 0-2 from %a1, lane 3 from %a0); %3 blends %a1 with the vector loaded
; from %a2 (even lanes from memory). The add consumes both blends.
1571define <4 x i32> @test_pblendd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
1572; GENERIC-LABEL: test_pblendd:
1573; GENERIC:       # %bb.0:
1574; GENERIC-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.50]
1575; GENERIC-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
1576; GENERIC-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1577; GENERIC-NEXT:    retq # sched: [1:1.00]
1578;
1579; HASWELL-LABEL: test_pblendd:
1580; HASWELL:       # %bb.0:
1581; HASWELL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
1582; HASWELL-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
1583; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1584; HASWELL-NEXT:    retq # sched: [7:1.00]
1585;
1586; BROADWELL-LABEL: test_pblendd:
1587; BROADWELL:       # %bb.0:
1588; BROADWELL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
1589; BROADWELL-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [6:0.50]
1590; BROADWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1591; BROADWELL-NEXT:    retq # sched: [7:1.00]
1592;
1593; SKYLAKE-LABEL: test_pblendd:
1594; SKYLAKE:       # %bb.0:
1595; SKYLAKE-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
1596; SKYLAKE-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
1597; SKYLAKE-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1598; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1599;
1600; SKX-LABEL: test_pblendd:
1601; SKX:       # %bb.0:
1602; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
1603; SKX-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
1604; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1605; SKX-NEXT:    retq # sched: [7:1.00]
1606;
1607; ZNVER1-LABEL: test_pblendd:
1608; ZNVER1:       # %bb.0:
1609; ZNVER1-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] sched: [1:0.50]
1610; ZNVER1-NEXT:    vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [8:1.00]
1611; ZNVER1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
1612; ZNVER1-NEXT:    retq # sched: [1:0.50]
1613  %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
1614  %2 = load <4 x i32>, <4 x i32> *%a2, align 16
1615  %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1616  %4 = add <4 x i32> %1, %3
1617  ret <4 x i32> %4
1618}
1619
; test_pblendd_ymm: 256-bit variant of the vpblendd test. %1 is a lane-wise
; blend of the register arguments (lanes 0-2 and 7 from %a1, lanes 3-6 from
; %a0); %3 blends %a1 with the vector loaded from %a2, taking only lanes 1-2
; from memory. The assertions expect one ymm vpblendd per shuffle, the second
; with a memory source, followed by vpaddd.
1620define <8 x i32> @test_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
1621; GENERIC-LABEL: test_pblendd_ymm:
1622; GENERIC:       # %bb.0:
1623; GENERIC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50]
1624; GENERIC-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
1625; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1626; GENERIC-NEXT:    retq # sched: [1:1.00]
1627;
1628; HASWELL-LABEL: test_pblendd_ymm:
1629; HASWELL:       # %bb.0:
1630; HASWELL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
1631; HASWELL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
1632; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1633; HASWELL-NEXT:    retq # sched: [7:1.00]
1634;
1635; BROADWELL-LABEL: test_pblendd_ymm:
1636; BROADWELL:       # %bb.0:
1637; BROADWELL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
1638; BROADWELL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [7:0.50]
1639; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1640; BROADWELL-NEXT:    retq # sched: [7:1.00]
1641;
1642; SKYLAKE-LABEL: test_pblendd_ymm:
1643; SKYLAKE:       # %bb.0:
1644; SKYLAKE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
1645; SKYLAKE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
1646; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1647; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1648;
1649; SKX-LABEL: test_pblendd_ymm:
1650; SKX:       # %bb.0:
1651; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
1652; SKX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
1653; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1654; SKX-NEXT:    retq # sched: [7:1.00]
1655;
1656; ZNVER1-LABEL: test_pblendd_ymm:
1657; ZNVER1:       # %bb.0:
1658; ZNVER1-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50]
1659; ZNVER1-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [9:1.50]
1660; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1661; ZNVER1-NEXT:    retq # sched: [1:0.50]
1662  %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15>
1663  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
1664  %3 = shufflevector <8 x i32> %a1, <8 x i32> %2, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
1665  %4 = add <8 x i32> %1, %3
1666  ret <8 x i32> %4
1667}
1668
; test_pblendvb: calls the llvm.x86.avx2.pblendvb intrinsic twice. The first
; call uses three register arguments (%a0, %a1, mask %a2); the second blends
; that result with the vector loaded from %a3 under mask %a4. The assertions
; expect a register-source vpblendvb followed by one whose source is the
; (%rdi) memory operand. The intrinsic itself is declared right after the
; function body.
1669define <32 x i8> @test_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2, <32 x i8> *%a3, <32 x i8> %a4) {
1670; GENERIC-LABEL: test_pblendvb:
1671; GENERIC:       # %bb.0:
1672; GENERIC-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
1673; GENERIC-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
1674; GENERIC-NEXT:    retq # sched: [1:1.00]
1675;
1676; HASWELL-LABEL: test_pblendvb:
1677; HASWELL:       # %bb.0:
1678; HASWELL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
1679; HASWELL-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
1680; HASWELL-NEXT:    retq # sched: [7:1.00]
1681;
1682; BROADWELL-LABEL: test_pblendvb:
1683; BROADWELL:       # %bb.0:
1684; BROADWELL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
1685; BROADWELL-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
1686; BROADWELL-NEXT:    retq # sched: [7:1.00]
1687;
1688; SKYLAKE-LABEL: test_pblendvb:
1689; SKYLAKE:       # %bb.0:
1690; SKYLAKE-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
1691; SKYLAKE-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:0.67]
1692; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1693;
1694; SKX-LABEL: test_pblendvb:
1695; SKX:       # %bb.0:
1696; SKX-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
1697; SKX-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:0.67]
1698; SKX-NEXT:    retq # sched: [7:1.00]
1699;
1700; ZNVER1-LABEL: test_pblendvb:
1701; ZNVER1:       # %bb.0:
1702; ZNVER1-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
1703; ZNVER1-NEXT:    vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
1704; ZNVER1-NEXT:    retq # sched: [1:0.50]
1705  %1 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2)
1706  %2 = load <32 x i8>, <32 x i8> *%a3, align 32
1707  %3 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %1, <32 x i8> %2, <32 x i8> %a4)
1708  ret <32 x i8> %3
1709}
1710declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
1711
; test_pblendw: two lane-wise blends of <16 x i16> vectors that the assertions
; expect to be selected as 256-bit vpblendw. In %1 lanes 2-4 and 10-12 come
; from %a1 and the rest from %a0 (the same pattern in each 8-lane half); in %3
; the even lanes come from the vector loaded from %a2 and the odd lanes from
; %a1, giving the memory-source form. The add (vpaddw) consumes both blends.
1712define <16 x i16> @test_pblendw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
1713; GENERIC-LABEL: test_pblendw:
1714; GENERIC:       # %bb.0:
1715; GENERIC-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:0.50]
1716; GENERIC-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:0.50]
1717; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1718; GENERIC-NEXT:    retq # sched: [1:1.00]
1719;
1720; HASWELL-LABEL: test_pblendw:
1721; HASWELL:       # %bb.0:
1722; HASWELL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
1723; HASWELL-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:1.00]
1724; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1725; HASWELL-NEXT:    retq # sched: [7:1.00]
1726;
1727; BROADWELL-LABEL: test_pblendw:
1728; BROADWELL:       # %bb.0:
1729; BROADWELL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
1730; BROADWELL-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [7:1.00]
1731; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1732; BROADWELL-NEXT:    retq # sched: [7:1.00]
1733;
1734; SKYLAKE-LABEL: test_pblendw:
1735; SKYLAKE:       # %bb.0:
1736; SKYLAKE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
1737; SKYLAKE-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:1.00]
1738; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1739; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1740;
1741; SKX-LABEL: test_pblendw:
1742; SKX:       # %bb.0:
1743; SKX-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
1744; SKX-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [8:1.00]
1745; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1746; SKX-NEXT:    retq # sched: [7:1.00]
1747;
1748; ZNVER1-LABEL: test_pblendw:
1749; ZNVER1:       # %bb.0:
1750; ZNVER1-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [2:0.33]
1751; ZNVER1-NEXT:    vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] sched: [9:0.50]
1752; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1753; ZNVER1-NEXT:    retq # sched: [1:0.50]
1754  %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 28, i32 13, i32 14, i32 15>
1755  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
1756  %3 = shufflevector <16 x i16> %a1, <16 x i16> %2, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
1757  %4 = add <16 x i16> %1, %3
1758  ret <16 x i16> %4
1759}
1760
; test_pbroadcastb: the zeroinitializer shuffle masks splat element 0 across
; all 16 byte lanes, which the assertions expect as the 128-bit vpbroadcastb.
; %1 splats from the register argument %a0 and %3 from the vector loaded via
; %a1, so both the register and the (%rdi) memory forms are covered. Note that
; the expected order of the two broadcasts is not the same for every CPU
; model: the BROADWELL and ZNVER1 assertions list the memory form first.
1761define <16 x i8> @test_pbroadcastb(<16 x i8> %a0, <16 x i8> *%a1) {
1762; GENERIC-LABEL: test_pbroadcastb:
1763; GENERIC:       # %bb.0:
1764; GENERIC-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [1:0.50]
1765; GENERIC-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [7:0.50]
1766; GENERIC-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1767; GENERIC-NEXT:    retq # sched: [1:1.00]
1768;
1769; HASWELL-LABEL: test_pbroadcastb:
1770; HASWELL:       # %bb.0:
1771; HASWELL-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
1772; HASWELL-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [9:1.00]
1773; HASWELL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1774; HASWELL-NEXT:    retq # sched: [7:1.00]
1775;
1776; BROADWELL-LABEL: test_pbroadcastb:
1777; BROADWELL:       # %bb.0:
1778; BROADWELL-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [9:1.00]
1779; BROADWELL-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
1780; BROADWELL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1781; BROADWELL-NEXT:    retq # sched: [7:1.00]
1782;
1783; SKYLAKE-LABEL: test_pbroadcastb:
1784; SKYLAKE:       # %bb.0:
1785; SKYLAKE-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
1786; SKYLAKE-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [7:1.00]
1787; SKYLAKE-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1788; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1789;
1790; SKX-LABEL: test_pbroadcastb:
1791; SKX:       # %bb.0:
1792; SKX-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
1793; SKX-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [7:1.00]
1794; SKX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1795; SKX-NEXT:    retq # sched: [7:1.00]
1796;
1797; ZNVER1-LABEL: test_pbroadcastb:
1798; ZNVER1:       # %bb.0:
1799; ZNVER1-NEXT:    vpbroadcastb (%rdi), %xmm1 # sched: [8:1.00]
1800; ZNVER1-NEXT:    vpbroadcastb %xmm0, %xmm0 # sched: [1:0.25]
1801; ZNVER1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
1802; ZNVER1-NEXT:    retq # sched: [1:0.50]
1803  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
1804  %2 = load <16 x i8>, <16 x i8> *%a1, align 16
1805  %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
1806  %4 = add <16 x i8> %1, %3
1807  ret <16 x i8> %4
1808}
1809
; test_pbroadcastb_ymm: 256-bit variant of the byte-broadcast test. Both
; zeroinitializer shuffles splat element 0 over 32 byte lanes; the assertions
; expect vpbroadcastb with a ymm destination, once from %xmm0 and once from
; the (%rdi) memory operand, followed by vpaddb. The BROADWELL and ZNVER1
; assertions list the memory form first; the other models list the register
; form first.
1810define <32 x i8> @test_pbroadcastb_ymm(<32 x i8> %a0, <32 x i8> *%a1) {
1811; GENERIC-LABEL: test_pbroadcastb_ymm:
1812; GENERIC:       # %bb.0:
1813; GENERIC-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [1:1.00]
1814; GENERIC-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [7:0.50]
1815; GENERIC-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1816; GENERIC-NEXT:    retq # sched: [1:1.00]
1817;
1818; HASWELL-LABEL: test_pbroadcastb_ymm:
1819; HASWELL:       # %bb.0:
1820; HASWELL-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
1821; HASWELL-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [9:1.00]
1822; HASWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1823; HASWELL-NEXT:    retq # sched: [7:1.00]
1824;
1825; BROADWELL-LABEL: test_pbroadcastb_ymm:
1826; BROADWELL:       # %bb.0:
1827; BROADWELL-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [9:1.00]
1828; BROADWELL-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
1829; BROADWELL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1830; BROADWELL-NEXT:    retq # sched: [7:1.00]
1831;
1832; SKYLAKE-LABEL: test_pbroadcastb_ymm:
1833; SKYLAKE:       # %bb.0:
1834; SKYLAKE-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
1835; SKYLAKE-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [8:1.00]
1836; SKYLAKE-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1837; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1838;
1839; SKX-LABEL: test_pbroadcastb_ymm:
1840; SKX:       # %bb.0:
1841; SKX-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
1842; SKX-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [8:1.00]
1843; SKX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1844; SKX-NEXT:    retq # sched: [7:1.00]
1845;
1846; ZNVER1-LABEL: test_pbroadcastb_ymm:
1847; ZNVER1:       # %bb.0:
1848; ZNVER1-NEXT:    vpbroadcastb (%rdi), %ymm1 # sched: [8:2.00]
1849; ZNVER1-NEXT:    vpbroadcastb %xmm0, %ymm0 # sched: [2:0.25]
1850; ZNVER1-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1851; ZNVER1-NEXT:    retq # sched: [1:0.50]
1852  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> zeroinitializer
1853  %2 = load <32 x i8>, <32 x i8> *%a1, align 32
1854  %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> zeroinitializer
1855  %4 = add <32 x i8> %1, %3
1856  ret <32 x i8> %4
1857}
1858
; test_pbroadcastd: splats element 0 of a <4 x i32> vector (zeroinitializer
; shuffle mask), once from the register argument %a0 and once from the vector
; loaded via %a1. The assertions expect the 128-bit vpbroadcastd in register
; and (%rdi) memory form, followed by vpaddd. Only the ZNVER1 assertions list
; the memory form first.
1859define <4 x i32> @test_pbroadcastd(<4 x i32> %a0, <4 x i32> *%a1) {
1860; GENERIC-LABEL: test_pbroadcastd:
1861; GENERIC:       # %bb.0:
1862; GENERIC-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:0.50]
1863; GENERIC-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [7:0.50]
1864; GENERIC-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1865; GENERIC-NEXT:    retq # sched: [1:1.00]
1866;
1867; HASWELL-LABEL: test_pbroadcastd:
1868; HASWELL:       # %bb.0:
1869; HASWELL-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
1870; HASWELL-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
1871; HASWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1872; HASWELL-NEXT:    retq # sched: [7:1.00]
1873;
1874; BROADWELL-LABEL: test_pbroadcastd:
1875; BROADWELL:       # %bb.0:
1876; BROADWELL-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
1877; BROADWELL-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [5:0.50]
1878; BROADWELL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1879; BROADWELL-NEXT:    retq # sched: [7:1.00]
1880;
1881; SKYLAKE-LABEL: test_pbroadcastd:
1882; SKYLAKE:       # %bb.0:
1883; SKYLAKE-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
1884; SKYLAKE-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
1885; SKYLAKE-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1886; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1887;
1888; SKX-LABEL: test_pbroadcastd:
1889; SKX:       # %bb.0:
1890; SKX-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
1891; SKX-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
1892; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1893; SKX-NEXT:    retq # sched: [7:1.00]
1894;
1895; ZNVER1-LABEL: test_pbroadcastd:
1896; ZNVER1:       # %bb.0:
1897; ZNVER1-NEXT:    vpbroadcastd (%rdi), %xmm1 # sched: [8:0.50]
1898; ZNVER1-NEXT:    vpbroadcastd %xmm0, %xmm0 # sched: [1:0.25]
1899; ZNVER1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
1900; ZNVER1-NEXT:    retq # sched: [1:0.50]
1901  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer
1902  %2 = load <4 x i32>, <4 x i32> *%a1, align 16
1903  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
1904  %4 = add <4 x i32> %1, %3
1905  ret <4 x i32> %4
1906}
1907
; test_pbroadcastd_ymm: 256-bit variant of the dword-broadcast test. Element 0
; of a <8 x i32> vector is splat across all lanes, once from the register
; argument and once from the vector loaded via %a1. The assertions expect
; vpbroadcastd with a ymm destination in register and (%rdi) memory form,
; followed by vpaddd. Only the ZNVER1 assertions list the memory form first.
1908define <8 x i32> @test_pbroadcastd_ymm(<8 x i32> %a0, <8 x i32> *%a1) {
1909; GENERIC-LABEL: test_pbroadcastd_ymm:
1910; GENERIC:       # %bb.0:
1911; GENERIC-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [1:1.00]
1912; GENERIC-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
1913; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1914; GENERIC-NEXT:    retq # sched: [1:1.00]
1915;
1916; HASWELL-LABEL: test_pbroadcastd_ymm:
1917; HASWELL:       # %bb.0:
1918; HASWELL-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
1919; HASWELL-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
1920; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1921; HASWELL-NEXT:    retq # sched: [7:1.00]
1922;
1923; BROADWELL-LABEL: test_pbroadcastd_ymm:
1924; BROADWELL:       # %bb.0:
1925; BROADWELL-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
1926; BROADWELL-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [6:0.50]
1927; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
1928; BROADWELL-NEXT:    retq # sched: [7:1.00]
1929;
1930; SKYLAKE-LABEL: test_pbroadcastd_ymm:
1931; SKYLAKE:       # %bb.0:
1932; SKYLAKE-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
1933; SKYLAKE-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
1934; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1935; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1936;
1937; SKX-LABEL: test_pbroadcastd_ymm:
1938; SKX:       # %bb.0:
1939; SKX-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
1940; SKX-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
1941; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
1942; SKX-NEXT:    retq # sched: [7:1.00]
1943;
1944; ZNVER1-LABEL: test_pbroadcastd_ymm:
1945; ZNVER1:       # %bb.0:
1946; ZNVER1-NEXT:    vpbroadcastd (%rdi), %ymm1 # sched: [8:0.50]
1947; ZNVER1-NEXT:    vpbroadcastd %xmm0, %ymm0 # sched: [2:0.25]
1948; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
1949; ZNVER1-NEXT:    retq # sched: [1:0.50]
1950  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
1951  %2 = load <8 x i32>, <8 x i32> *%a1, align 32
1952  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
1953  %4 = add <8 x i32> %1, %3
1954  ret <8 x i32> %4
1955}
1956
; test_pbroadcastq: splats element 0 of a <2 x i64> vector (zeroinitializer
; shuffle mask), once from the register argument %a0 and once from the vector
; loaded via %a1. The assertions expect the 128-bit vpbroadcastq in register
; and (%rdi) memory form, followed by vpaddq. Only the ZNVER1 assertions list
; the memory form first.
1957define <2 x i64> @test_pbroadcastq(<2 x i64> %a0, <2 x i64> *%a1) {
1958; GENERIC-LABEL: test_pbroadcastq:
1959; GENERIC:       # %bb.0:
1960; GENERIC-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:0.50]
1961; GENERIC-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [7:0.50]
1962; GENERIC-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1963; GENERIC-NEXT:    retq # sched: [1:1.00]
1964;
1965; HASWELL-LABEL: test_pbroadcastq:
1966; HASWELL:       # %bb.0:
1967; HASWELL-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
1968; HASWELL-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
1969; HASWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1970; HASWELL-NEXT:    retq # sched: [7:1.00]
1971;
1972; BROADWELL-LABEL: test_pbroadcastq:
1973; BROADWELL:       # %bb.0:
1974; BROADWELL-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
1975; BROADWELL-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [5:0.50]
1976; BROADWELL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1977; BROADWELL-NEXT:    retq # sched: [7:1.00]
1978;
1979; SKYLAKE-LABEL: test_pbroadcastq:
1980; SKYLAKE:       # %bb.0:
1981; SKYLAKE-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
1982; SKYLAKE-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
1983; SKYLAKE-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1984; SKYLAKE-NEXT:    retq # sched: [7:1.00]
1985;
1986; SKX-LABEL: test_pbroadcastq:
1987; SKX:       # %bb.0:
1988; SKX-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
1989; SKX-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
1990; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
1991; SKX-NEXT:    retq # sched: [7:1.00]
1992;
1993; ZNVER1-LABEL: test_pbroadcastq:
1994; ZNVER1:       # %bb.0:
1995; ZNVER1-NEXT:    vpbroadcastq (%rdi), %xmm1 # sched: [8:0.50]
1996; ZNVER1-NEXT:    vpbroadcastq %xmm0, %xmm0 # sched: [1:0.25]
1997; ZNVER1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
1998; ZNVER1-NEXT:    retq # sched: [1:0.50]
1999  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
2000  %2 = load <2 x i64>, <2 x i64> *%a1, align 16
2001  %3 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
2002  %4 = add <2 x i64> %1, %3
2003  ret <2 x i64> %4
2004}
2005
; test_pbroadcastq_ymm: 256-bit variant of the qword-broadcast test. Element 0
; of a <4 x i64> vector is splat across all lanes, once from the register
; argument and once from the vector loaded via %a1. The assertions expect
; vpbroadcastq with a ymm destination in register and (%rdi) memory form,
; followed by vpaddq. Only the ZNVER1 assertions list the memory form first.
2006define <4 x i64> @test_pbroadcastq_ymm(<4 x i64> %a0, <4 x i64> *%a1) {
2007; GENERIC-LABEL: test_pbroadcastq_ymm:
2008; GENERIC:       # %bb.0:
2009; GENERIC-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [1:1.00]
2010; GENERIC-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
2011; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2012; GENERIC-NEXT:    retq # sched: [1:1.00]
2013;
2014; HASWELL-LABEL: test_pbroadcastq_ymm:
2015; HASWELL:       # %bb.0:
2016; HASWELL-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
2017; HASWELL-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
2018; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2019; HASWELL-NEXT:    retq # sched: [7:1.00]
2020;
2021; BROADWELL-LABEL: test_pbroadcastq_ymm:
2022; BROADWELL:       # %bb.0:
2023; BROADWELL-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
2024; BROADWELL-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [6:0.50]
2025; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2026; BROADWELL-NEXT:    retq # sched: [7:1.00]
2027;
2028; SKYLAKE-LABEL: test_pbroadcastq_ymm:
2029; SKYLAKE:       # %bb.0:
2030; SKYLAKE-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
2031; SKYLAKE-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
2032; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
2033; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2034;
2035; SKX-LABEL: test_pbroadcastq_ymm:
2036; SKX:       # %bb.0:
2037; SKX-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
2038; SKX-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
2039; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
2040; SKX-NEXT:    retq # sched: [7:1.00]
2041;
2042; ZNVER1-LABEL: test_pbroadcastq_ymm:
2043; ZNVER1:       # %bb.0:
2044; ZNVER1-NEXT:    vpbroadcastq (%rdi), %ymm1 # sched: [8:0.50]
2045; ZNVER1-NEXT:    vpbroadcastq %xmm0, %ymm0 # sched: [2:0.25]
2046; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2047; ZNVER1-NEXT:    retq # sched: [1:0.50]
2048  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
2049  %2 = load <4 x i64>, <4 x i64> *%a1, align 32
2050  %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> zeroinitializer
2051  %4 = add <4 x i64> %1, %3
2052  ret <4 x i64> %4
2053}
2054
; Splats element 0 of %a0 (register source) and element 0 of the vector loaded
; from %a1 (memory source) across all eight i16 lanes, then adds the two splats
; so that both broadcasts feed the returned value. The autogenerated assertions
; below expect one register-form and one memory-form vpbroadcastw with an xmm
; destination plus a vpaddw, each with the sched annotation printed for that
; CPU model.
2055define <8 x i16> @test_pbroadcastw(<8 x i16> %a0, <8 x i16> *%a1) {
2056; GENERIC-LABEL: test_pbroadcastw:
2057; GENERIC:       # %bb.0:
2058; GENERIC-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [1:0.50]
2059; GENERIC-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [7:0.50]
2060; GENERIC-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
2061; GENERIC-NEXT:    retq # sched: [1:1.00]
2062;
2063; HASWELL-LABEL: test_pbroadcastw:
2064; HASWELL:       # %bb.0:
2065; HASWELL-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
2066; HASWELL-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [9:1.00]
2067; HASWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
2068; HASWELL-NEXT:    retq # sched: [7:1.00]
2069;
2070; BROADWELL-LABEL: test_pbroadcastw:
2071; BROADWELL:       # %bb.0:
2072; BROADWELL-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [9:1.00]
2073; BROADWELL-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
2074; BROADWELL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
2075; BROADWELL-NEXT:    retq # sched: [7:1.00]
2076;
2077; SKYLAKE-LABEL: test_pbroadcastw:
2078; SKYLAKE:       # %bb.0:
2079; SKYLAKE-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
2080; SKYLAKE-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [7:1.00]
2081; SKYLAKE-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
2082; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2083;
2084; SKX-LABEL: test_pbroadcastw:
2085; SKX:       # %bb.0:
2086; SKX-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
2087; SKX-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [7:1.00]
2088; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
2089; SKX-NEXT:    retq # sched: [7:1.00]
2090;
2091; ZNVER1-LABEL: test_pbroadcastw:
2092; ZNVER1:       # %bb.0:
2093; ZNVER1-NEXT:    vpbroadcastw (%rdi), %xmm1 # sched: [8:1.00]
2094; ZNVER1-NEXT:    vpbroadcastw %xmm0, %xmm0 # sched: [1:0.25]
2095; ZNVER1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
2096; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; An all-zero shuffle mask selects element 0 for every result lane.
2097  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
2098  %2 = load <8 x i16>, <8 x i16> *%a1, align 16
2099  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
  ; The add consumes both splats, so neither broadcast is dead.
2100  %4 = add <8 x i16> %1, %3
2101  ret <8 x i16> %4
2102}
2103
; Splats element 0 of %a0 (register source) and element 0 of the vector loaded
; from %a1 (memory source) across all sixteen i16 lanes, then adds the two
; splats so that both broadcasts feed the returned value. The autogenerated
; assertions below expect one register-form and one memory-form vpbroadcastw
; with a ymm destination plus a vpaddw, each with the sched annotation printed
; for that CPU model.
2104define <16 x i16> @test_pbroadcastw_ymm(<16 x i16> %a0, <16 x i16> *%a1) {
2105; GENERIC-LABEL: test_pbroadcastw_ymm:
2106; GENERIC:       # %bb.0:
2107; GENERIC-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [1:1.00]
2108; GENERIC-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [7:0.50]
2109; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2110; GENERIC-NEXT:    retq # sched: [1:1.00]
2111;
2112; HASWELL-LABEL: test_pbroadcastw_ymm:
2113; HASWELL:       # %bb.0:
2114; HASWELL-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
2115; HASWELL-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [9:1.00]
2116; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2117; HASWELL-NEXT:    retq # sched: [7:1.00]
2118;
2119; BROADWELL-LABEL: test_pbroadcastw_ymm:
2120; BROADWELL:       # %bb.0:
2121; BROADWELL-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [9:1.00]
2122; BROADWELL-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
2123; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2124; BROADWELL-NEXT:    retq # sched: [7:1.00]
2125;
2126; SKYLAKE-LABEL: test_pbroadcastw_ymm:
2127; SKYLAKE:       # %bb.0:
2128; SKYLAKE-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
2129; SKYLAKE-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [8:1.00]
2130; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
2131; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2132;
2133; SKX-LABEL: test_pbroadcastw_ymm:
2134; SKX:       # %bb.0:
2135; SKX-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
2136; SKX-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [8:1.00]
2137; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
2138; SKX-NEXT:    retq # sched: [7:1.00]
2139;
2140; ZNVER1-LABEL: test_pbroadcastw_ymm:
2141; ZNVER1:       # %bb.0:
2142; ZNVER1-NEXT:    vpbroadcastw (%rdi), %ymm1 # sched: [8:2.00]
2143; ZNVER1-NEXT:    vpbroadcastw %xmm0, %ymm0 # sched: [2:0.25]
2144; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2145; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; An all-zero shuffle mask selects element 0 for every result lane.
2146  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer
2147  %2 = load <16 x i16>, <16 x i16> *%a1, align 32
2148  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> zeroinitializer
  ; The add consumes both splats, so neither broadcast is dead.
2149  %4 = add <16 x i16> %1, %3
2150  ret <16 x i16> %4
2151}
2152
; Compares %a0 with %a1 for equality, sign-extends the i1 results to i8 lanes,
; then compares that mask for equality with the vector loaded from %a2 and
; sign-extends again. The autogenerated assertions below expect a register-form
; vpcmpeqb followed by a memory-form vpcmpeqb on ymm registers, each with the
; sched annotation printed for that CPU model.
2153define <32 x i8> @test_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
2154; GENERIC-LABEL: test_pcmpeqb:
2155; GENERIC:       # %bb.0:
2156; GENERIC-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2157; GENERIC-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2158; GENERIC-NEXT:    retq # sched: [1:1.00]
2159;
2160; HASWELL-LABEL: test_pcmpeqb:
2161; HASWELL:       # %bb.0:
2162; HASWELL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2163; HASWELL-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2164; HASWELL-NEXT:    retq # sched: [7:1.00]
2165;
2166; BROADWELL-LABEL: test_pcmpeqb:
2167; BROADWELL:       # %bb.0:
2168; BROADWELL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2169; BROADWELL-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
2170; BROADWELL-NEXT:    retq # sched: [7:1.00]
2171;
2172; SKYLAKE-LABEL: test_pcmpeqb:
2173; SKYLAKE:       # %bb.0:
2174; SKYLAKE-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2175; SKYLAKE-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2176; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2177;
2178; SKX-LABEL: test_pcmpeqb:
2179; SKX:       # %bb.0:
2180; SKX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2181; SKX-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2182; SKX-NEXT:    retq # sched: [7:1.00]
2183;
2184; ZNVER1-LABEL: test_pcmpeqb:
2185; ZNVER1:       # %bb.0:
2186; ZNVER1-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2187; ZNVER1-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2188; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2189  %1 = icmp eq <32 x i8> %a0, %a1
2190  %2 = sext <32 x i1> %1 to <32 x i8>
  ; Second compare takes its right-hand operand from memory.
2191  %3 = load <32 x i8>, <32 x i8> *%a2, align 32
2192  %4 = icmp eq <32 x i8> %2, %3
2193  %5 = sext <32 x i1> %4 to <32 x i8>
2194  ret <32 x i8> %5
2195}
2196
; Compares %a0 with %a1 for equality, sign-extends the i1 results to i32 lanes,
; then compares that mask for equality with the vector loaded from %a2 and
; sign-extends again. The autogenerated assertions below expect a register-form
; vpcmpeqd followed by a memory-form vpcmpeqd on ymm registers, each with the
; sched annotation printed for that CPU model.
2197define <8 x i32> @test_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
2198; GENERIC-LABEL: test_pcmpeqd:
2199; GENERIC:       # %bb.0:
2200; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2201; GENERIC-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2202; GENERIC-NEXT:    retq # sched: [1:1.00]
2203;
2204; HASWELL-LABEL: test_pcmpeqd:
2205; HASWELL:       # %bb.0:
2206; HASWELL-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2207; HASWELL-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2208; HASWELL-NEXT:    retq # sched: [7:1.00]
2209;
2210; BROADWELL-LABEL: test_pcmpeqd:
2211; BROADWELL:       # %bb.0:
2212; BROADWELL-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2213; BROADWELL-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
2214; BROADWELL-NEXT:    retq # sched: [7:1.00]
2215;
2216; SKYLAKE-LABEL: test_pcmpeqd:
2217; SKYLAKE:       # %bb.0:
2218; SKYLAKE-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2219; SKYLAKE-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2220; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2221;
2222; SKX-LABEL: test_pcmpeqd:
2223; SKX:       # %bb.0:
2224; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2225; SKX-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2226; SKX-NEXT:    retq # sched: [7:1.00]
2227;
2228; ZNVER1-LABEL: test_pcmpeqd:
2229; ZNVER1:       # %bb.0:
2230; ZNVER1-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2231; ZNVER1-NEXT:    vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2232; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2233  %1 = icmp eq <8 x i32> %a0, %a1
2234  %2 = sext <8 x i1> %1 to <8 x i32>
  ; Second compare takes its right-hand operand from memory.
2235  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
2236  %4 = icmp eq <8 x i32> %2, %3
2237  %5 = sext <8 x i1> %4 to <8 x i32>
2238  ret <8 x i32> %5
2239}
2240
; Compares %a0 with %a1 for equality, sign-extends the i1 results to i64 lanes,
; then compares that mask for equality with the vector loaded from %a2 and
; sign-extends again. The autogenerated assertions below expect a register-form
; vpcmpeqq followed by a memory-form vpcmpeqq on ymm registers, each with the
; sched annotation printed for that CPU model.
2241define <4 x i64> @test_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
2242; GENERIC-LABEL: test_pcmpeqq:
2243; GENERIC:       # %bb.0:
2244; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2245; GENERIC-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2246; GENERIC-NEXT:    retq # sched: [1:1.00]
2247;
2248; HASWELL-LABEL: test_pcmpeqq:
2249; HASWELL:       # %bb.0:
2250; HASWELL-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2251; HASWELL-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2252; HASWELL-NEXT:    retq # sched: [7:1.00]
2253;
2254; BROADWELL-LABEL: test_pcmpeqq:
2255; BROADWELL:       # %bb.0:
2256; BROADWELL-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2257; BROADWELL-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
2258; BROADWELL-NEXT:    retq # sched: [7:1.00]
2259;
2260; SKYLAKE-LABEL: test_pcmpeqq:
2261; SKYLAKE:       # %bb.0:
2262; SKYLAKE-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2263; SKYLAKE-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2264; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2265;
2266; SKX-LABEL: test_pcmpeqq:
2267; SKX:       # %bb.0:
2268; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2269; SKX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2270; SKX-NEXT:    retq # sched: [7:1.00]
2271;
2272; ZNVER1-LABEL: test_pcmpeqq:
2273; ZNVER1:       # %bb.0:
2274; ZNVER1-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2275; ZNVER1-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2276; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2277  %1 = icmp eq <4 x i64> %a0, %a1
2278  %2 = sext <4 x i1> %1 to <4 x i64>
  ; Second compare takes its right-hand operand from memory.
2279  %3 = load <4 x i64>, <4 x i64> *%a2, align 32
2280  %4 = icmp eq <4 x i64> %2, %3
2281  %5 = sext <4 x i1> %4 to <4 x i64>
2282  ret <4 x i64> %5
2283}
2284
; Compares %a0 with %a1 for equality, sign-extends the i1 results to i16 lanes,
; then compares that mask for equality with the vector loaded from %a2 and
; sign-extends again. The autogenerated assertions below expect a register-form
; vpcmpeqw followed by a memory-form vpcmpeqw on ymm registers, each with the
; sched annotation printed for that CPU model.
2285define <16 x i16> @test_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
2286; GENERIC-LABEL: test_pcmpeqw:
2287; GENERIC:       # %bb.0:
2288; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2289; GENERIC-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2290; GENERIC-NEXT:    retq # sched: [1:1.00]
2291;
2292; HASWELL-LABEL: test_pcmpeqw:
2293; HASWELL:       # %bb.0:
2294; HASWELL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2295; HASWELL-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2296; HASWELL-NEXT:    retq # sched: [7:1.00]
2297;
2298; BROADWELL-LABEL: test_pcmpeqw:
2299; BROADWELL:       # %bb.0:
2300; BROADWELL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2301; BROADWELL-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
2302; BROADWELL-NEXT:    retq # sched: [7:1.00]
2303;
2304; SKYLAKE-LABEL: test_pcmpeqw:
2305; SKYLAKE:       # %bb.0:
2306; SKYLAKE-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2307; SKYLAKE-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2308; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2309;
2310; SKX-LABEL: test_pcmpeqw:
2311; SKX:       # %bb.0:
2312; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2313; SKX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2314; SKX-NEXT:    retq # sched: [7:1.00]
2315;
2316; ZNVER1-LABEL: test_pcmpeqw:
2317; ZNVER1:       # %bb.0:
2318; ZNVER1-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2319; ZNVER1-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2320; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2321  %1 = icmp eq <16 x i16> %a0, %a1
2322  %2 = sext <16 x i1> %1 to <16 x i16>
  ; Second compare takes its right-hand operand from memory.
2323  %3 = load <16 x i16>, <16 x i16> *%a2, align 32
2324  %4 = icmp eq <16 x i16> %2, %3
2325  %5 = sext <16 x i1> %4 to <16 x i16>
2326  ret <16 x i16> %5
2327}
2328
; Performs a signed greater-than compare of %a0 against %a1, sign-extends the
; i1 results to i8 lanes, then performs a signed greater-than compare of that
; mask against the vector loaded from %a2 and sign-extends again. The
; autogenerated assertions below expect a register-form vpcmpgtb followed by a
; memory-form vpcmpgtb on ymm registers, each with the sched annotation printed
; for that CPU model.
2329define <32 x i8> @test_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
2330; GENERIC-LABEL: test_pcmpgtb:
2331; GENERIC:       # %bb.0:
2332; GENERIC-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2333; GENERIC-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2334; GENERIC-NEXT:    retq # sched: [1:1.00]
2335;
2336; HASWELL-LABEL: test_pcmpgtb:
2337; HASWELL:       # %bb.0:
2338; HASWELL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2339; HASWELL-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2340; HASWELL-NEXT:    retq # sched: [7:1.00]
2341;
2342; BROADWELL-LABEL: test_pcmpgtb:
2343; BROADWELL:       # %bb.0:
2344; BROADWELL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2345; BROADWELL-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
2346; BROADWELL-NEXT:    retq # sched: [7:1.00]
2347;
2348; SKYLAKE-LABEL: test_pcmpgtb:
2349; SKYLAKE:       # %bb.0:
2350; SKYLAKE-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2351; SKYLAKE-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2352; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2353;
2354; SKX-LABEL: test_pcmpgtb:
2355; SKX:       # %bb.0:
2356; SKX-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2357; SKX-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2358; SKX-NEXT:    retq # sched: [7:1.00]
2359;
2360; ZNVER1-LABEL: test_pcmpgtb:
2361; ZNVER1:       # %bb.0:
2362; ZNVER1-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2363; ZNVER1-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2364; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2365  %1 = icmp sgt <32 x i8> %a0, %a1
2366  %2 = sext <32 x i1> %1 to <32 x i8>
  ; Second compare takes its right-hand operand from memory.
2367  %3 = load <32 x i8>, <32 x i8> *%a2, align 32
2368  %4 = icmp sgt <32 x i8> %2, %3
2369  %5 = sext <32 x i1> %4 to <32 x i8>
2370  ret <32 x i8> %5
2371}
2372
; Performs a signed greater-than compare of %a0 against %a1, sign-extends the
; i1 results to i32 lanes, then performs a signed greater-than compare of that
; mask against the vector loaded from %a2 and sign-extends again. The
; autogenerated assertions below expect a register-form vpcmpgtd followed by a
; memory-form vpcmpgtd on ymm registers, each with the sched annotation printed
; for that CPU model.
2373define <8 x i32> @test_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
2374; GENERIC-LABEL: test_pcmpgtd:
2375; GENERIC:       # %bb.0:
2376; GENERIC-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2377; GENERIC-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2378; GENERIC-NEXT:    retq # sched: [1:1.00]
2379;
2380; HASWELL-LABEL: test_pcmpgtd:
2381; HASWELL:       # %bb.0:
2382; HASWELL-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2383; HASWELL-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2384; HASWELL-NEXT:    retq # sched: [7:1.00]
2385;
2386; BROADWELL-LABEL: test_pcmpgtd:
2387; BROADWELL:       # %bb.0:
2388; BROADWELL-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2389; BROADWELL-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
2390; BROADWELL-NEXT:    retq # sched: [7:1.00]
2391;
2392; SKYLAKE-LABEL: test_pcmpgtd:
2393; SKYLAKE:       # %bb.0:
2394; SKYLAKE-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2395; SKYLAKE-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2396; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2397;
2398; SKX-LABEL: test_pcmpgtd:
2399; SKX:       # %bb.0:
2400; SKX-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2401; SKX-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2402; SKX-NEXT:    retq # sched: [7:1.00]
2403;
2404; ZNVER1-LABEL: test_pcmpgtd:
2405; ZNVER1:       # %bb.0:
2406; ZNVER1-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2407; ZNVER1-NEXT:    vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2408; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2409  %1 = icmp sgt <8 x i32> %a0, %a1
2410  %2 = sext <8 x i1> %1 to <8 x i32>
  ; Second compare takes its right-hand operand from memory.
2411  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
2412  %4 = icmp sgt <8 x i32> %2, %3
2413  %5 = sext <8 x i1> %4 to <8 x i32>
2414  ret <8 x i32> %5
2415}
2416
; Performs a signed greater-than compare of %a0 against %a1, sign-extends the
; i1 results to i64 lanes, then performs a signed greater-than compare of that
; mask against the vector loaded from %a2 and sign-extends again. The
; autogenerated assertions below expect a register-form vpcmpgtq followed by a
; memory-form vpcmpgtq on ymm registers, each with the sched annotation printed
; for that CPU model.
2417define <4 x i64> @test_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
2418; GENERIC-LABEL: test_pcmpgtq:
2419; GENERIC:       # %bb.0:
2420; GENERIC-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2421; GENERIC-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2422; GENERIC-NEXT:    retq # sched: [1:1.00]
2423;
2424; HASWELL-LABEL: test_pcmpgtq:
2425; HASWELL:       # %bb.0:
2426; HASWELL-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
2427; HASWELL-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
2428; HASWELL-NEXT:    retq # sched: [7:1.00]
2429;
2430; BROADWELL-LABEL: test_pcmpgtq:
2431; BROADWELL:       # %bb.0:
2432; BROADWELL-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
2433; BROADWELL-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
2434; BROADWELL-NEXT:    retq # sched: [7:1.00]
2435;
2436; SKYLAKE-LABEL: test_pcmpgtq:
2437; SKYLAKE:       # %bb.0:
2438; SKYLAKE-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
2439; SKYLAKE-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2440; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2441;
2442; SKX-LABEL: test_pcmpgtq:
2443; SKX:       # %bb.0:
2444; SKX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
2445; SKX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2446; SKX-NEXT:    retq # sched: [7:1.00]
2447;
2448; ZNVER1-LABEL: test_pcmpgtq:
2449; ZNVER1:       # %bb.0:
2450; ZNVER1-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2451; ZNVER1-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
2452; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2453  %1 = icmp sgt <4 x i64> %a0, %a1
2454  %2 = sext <4 x i1> %1 to <4 x i64>
  ; Second compare takes its right-hand operand from memory.
2455  %3 = load <4 x i64>, <4 x i64> *%a2, align 32
2456  %4 = icmp sgt <4 x i64> %2, %3
2457  %5 = sext <4 x i1> %4 to <4 x i64>
2458  ret <4 x i64> %5
2459}
2460
; Performs a signed greater-than compare of %a0 against %a1, sign-extends the
; i1 results to i16 lanes, then performs a signed greater-than compare of that
; mask against the vector loaded from %a2 and sign-extends again. The
; autogenerated assertions below expect a register-form vpcmpgtw followed by a
; memory-form vpcmpgtw on ymm registers, each with the sched annotation printed
; for that CPU model.
2461define <16 x i16> @test_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
2462; GENERIC-LABEL: test_pcmpgtw:
2463; GENERIC:       # %bb.0:
2464; GENERIC-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2465; GENERIC-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2466; GENERIC-NEXT:    retq # sched: [1:1.00]
2467;
2468; HASWELL-LABEL: test_pcmpgtw:
2469; HASWELL:       # %bb.0:
2470; HASWELL-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2471; HASWELL-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2472; HASWELL-NEXT:    retq # sched: [7:1.00]
2473;
2474; BROADWELL-LABEL: test_pcmpgtw:
2475; BROADWELL:       # %bb.0:
2476; BROADWELL-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2477; BROADWELL-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
2478; BROADWELL-NEXT:    retq # sched: [7:1.00]
2479;
2480; SKYLAKE-LABEL: test_pcmpgtw:
2481; SKYLAKE:       # %bb.0:
2482; SKYLAKE-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2483; SKYLAKE-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2484; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2485;
2486; SKX-LABEL: test_pcmpgtw:
2487; SKX:       # %bb.0:
2488; SKX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2489; SKX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2490; SKX-NEXT:    retq # sched: [7:1.00]
2491;
2492; ZNVER1-LABEL: test_pcmpgtw:
2493; ZNVER1:       # %bb.0:
2494; ZNVER1-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2495; ZNVER1-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
2496; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; First compare uses two register operands.
2497  %1 = icmp sgt <16 x i16> %a0, %a1
2498  %2 = sext <16 x i1> %1 to <16 x i16>
  ; Second compare takes its right-hand operand from memory.
2499  %3 = load <16 x i16>, <16 x i16> *%a2, align 32
2500  %4 = icmp sgt <16 x i16> %2, %3
2501  %5 = sext <16 x i1> %4 to <16 x i16>
2502  ret <16 x i16> %5
2503}
2504
; Builds two <4 x i64> shuffles that each take elements 2,3 of %a0 followed by
; elements 0,1 of a second vector: first %a1 (register source), then the vector
; loaded from %a2 (memory source). The two results are added so both shuffles
; feed the returned value. The autogenerated assertions below expect a
; register-form and a memory-form vperm2i128 plus a vpaddq, each with the sched
; annotation printed for that CPU model.
2505define <4 x i64> @test_perm2i128(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
2506; GENERIC-LABEL: test_perm2i128:
2507; GENERIC:       # %bb.0:
2508; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
2509; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
2510; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
2511; GENERIC-NEXT:    retq # sched: [1:1.00]
2512;
2513; HASWELL-LABEL: test_perm2i128:
2514; HASWELL:       # %bb.0:
2515; HASWELL-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
2516; HASWELL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
2517; HASWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
2518; HASWELL-NEXT:    retq # sched: [7:1.00]
2519;
2520; BROADWELL-LABEL: test_perm2i128:
2521; BROADWELL:       # %bb.0:
2522; BROADWELL-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
2523; BROADWELL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [9:1.00]
2524; BROADWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
2525; BROADWELL-NEXT:    retq # sched: [7:1.00]
2526;
2527; SKYLAKE-LABEL: test_perm2i128:
2528; SKYLAKE:       # %bb.0:
2529; SKYLAKE-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
2530; SKYLAKE-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
2531; SKYLAKE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
2532; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2533;
2534; SKX-LABEL: test_perm2i128:
2535; SKX:       # %bb.0:
2536; SKX-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
2537; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
2538; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
2539; SKX-NEXT:    retq # sched: [7:1.00]
2540;
2541; ZNVER1-LABEL: test_perm2i128:
2542; ZNVER1:       # %bb.0:
2543; ZNVER1-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [2:0.25]
2544; ZNVER1-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [9:0.50]
2545; ZNVER1-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
2546; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Mask indices 2,3 pick from the first operand; 4,5 pick elements 0,1 of the
  ; second operand.
2547  %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
2548  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
2549  %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ; The add consumes both shuffles, so neither permute is dead.
2550  %4 = add <4 x i64> %1, %3
2551  ret <4 x i64> %4
2552}
2553
; Calls the llvm.x86.avx2.permd intrinsic twice with %a0 as the second argument:
; once with %a1 (register source) and once with the vector loaded from %a2
; (memory source) as the first argument. The two results are added so both
; calls feed the returned value. The autogenerated assertions below expect a
; register-form and a memory-form vpermd plus a vpaddd, each with the sched
; annotation printed for that CPU model.
2554define <8 x i32> @test_permd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
2555; GENERIC-LABEL: test_permd:
2556; GENERIC:       # %bb.0:
2557; GENERIC-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
2558; GENERIC-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
2559; GENERIC-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
2560; GENERIC-NEXT:    retq # sched: [1:1.00]
2561;
2562; HASWELL-LABEL: test_permd:
2563; HASWELL:       # %bb.0:
2564; HASWELL-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2565; HASWELL-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2566; HASWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
2567; HASWELL-NEXT:    retq # sched: [7:1.00]
2568;
2569; BROADWELL-LABEL: test_permd:
2570; BROADWELL:       # %bb.0:
2571; BROADWELL-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2572; BROADWELL-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
2573; BROADWELL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
2574; BROADWELL-NEXT:    retq # sched: [7:1.00]
2575;
2576; SKYLAKE-LABEL: test_permd:
2577; SKYLAKE:       # %bb.0:
2578; SKYLAKE-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2579; SKYLAKE-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2580; SKYLAKE-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
2581; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2582;
2583; SKX-LABEL: test_permd:
2584; SKX:       # %bb.0:
2585; SKX-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2586; SKX-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2587; SKX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
2588; SKX-NEXT:    retq # sched: [7:1.00]
2589;
2590; ZNVER1-LABEL: test_permd:
2591; ZNVER1:       # %bb.0:
2592; ZNVER1-NEXT:    vpermd %ymm1, %ymm0, %ymm1 # sched: [2:0.25]
2593; ZNVER1-NEXT:    vpermd (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
2594; ZNVER1-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
2595; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; NOTE(review): %a0 is presumably the index vector and the first argument the
  ; data being permuted; confirm against the intrinsic's definition.
2596  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
2597  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
2598  %3 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> %a0)
  ; The add consumes both results, so neither permute is dead.
2599  %4 = add <8 x i32> %1, %3
2600  ret <8 x i32> %4
2601}
; Declaration of the intrinsic called by test_permd above.
2602declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
2603
; Shuffles %a0 (register source) with mask <3,2,2,3> and the vector loaded from
; %a1 (memory source) with mask <0,2,2,3>, then fadds the two results so both
; shuffles feed the returned value. The autogenerated assertions below expect a
; register-form and a memory-form vpermpd plus a vaddpd, each with the sched
; annotation printed for that CPU model.
; NOTE(review): the ZNVER1 expectations carry latencies of 100 and 107 for
; vpermpd, far above every other model here; presumably a placeholder in that
; scheduling model. Confirm before relying on these numbers.
2604define <4 x double> @test_permpd(<4 x double> %a0, <4 x double> *%a1) {
2605; GENERIC-LABEL: test_permpd:
2606; GENERIC:       # %bb.0:
2607; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
2608; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00]
2609; GENERIC-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
2610; GENERIC-NEXT:    retq # sched: [1:1.00]
2611;
2612; HASWELL-LABEL: test_permpd:
2613; HASWELL:       # %bb.0:
2614; HASWELL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2615; HASWELL-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
2616; HASWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
2617; HASWELL-NEXT:    retq # sched: [7:1.00]
2618;
2619; BROADWELL-LABEL: test_permpd:
2620; BROADWELL:       # %bb.0:
2621; BROADWELL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2622; BROADWELL-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:1.00]
2623; BROADWELL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
2624; BROADWELL-NEXT:    retq # sched: [7:1.00]
2625;
2626; SKYLAKE-LABEL: test_permpd:
2627; SKYLAKE:       # %bb.0:
2628; SKYLAKE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2629; SKYLAKE-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
2630; SKYLAKE-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
2631; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2632;
2633; SKX-LABEL: test_permpd:
2634; SKX:       # %bb.0:
2635; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2636; SKX-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
2637; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
2638; SKX-NEXT:    retq # sched: [7:1.00]
2639;
2640; ZNVER1-LABEL: test_permpd:
2641; ZNVER1:       # %bb.0:
2642; ZNVER1-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [107:0.50]
2643; ZNVER1-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [100:0.25]
2644; ZNVER1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
2645; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; The two shuffles use different masks (3,2,2,3 versus 0,2,2,3).
2646  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
2647  %2 = load <4 x double>, <4 x double> *%a1, align 32
2648  %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 2, i32 3>
  ; The fadd consumes both shuffles, so neither permute is dead.
2649  %4 = fadd <4 x double> %1, %3
2650  ret <4 x double> %4
2651}
2652
; Calls the llvm.x86.avx2.permps intrinsic twice with the <8 x i32> %a0 as the
; second argument: once with %a1 (register source) and once with the vector
; loaded from %a2 (memory source) as the first argument. The two results are
; fadded so both calls feed the returned value. The autogenerated assertions
; below expect a register-form and a memory-form vpermps plus a vaddps, each
; with the sched annotation printed for that CPU model.
; NOTE(review): the ZNVER1 expectations carry latencies of 100 and 107 for
; vpermps, far above every other model here; presumably a placeholder in that
; scheduling model. Confirm before relying on these numbers.
2653define <8 x float> @test_permps(<8 x i32> %a0, <8 x float> %a1, <8 x float> *%a2) {
2654; GENERIC-LABEL: test_permps:
2655; GENERIC:       # %bb.0:
2656; GENERIC-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
2657; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
2658; GENERIC-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
2659; GENERIC-NEXT:    retq # sched: [1:1.00]
2660;
2661; HASWELL-LABEL: test_permps:
2662; HASWELL:       # %bb.0:
2663; HASWELL-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2664; HASWELL-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2665; HASWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
2666; HASWELL-NEXT:    retq # sched: [7:1.00]
2667;
2668; BROADWELL-LABEL: test_permps:
2669; BROADWELL:       # %bb.0:
2670; BROADWELL-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2671; BROADWELL-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
2672; BROADWELL-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
2673; BROADWELL-NEXT:    retq # sched: [7:1.00]
2674;
2675; SKYLAKE-LABEL: test_permps:
2676; SKYLAKE:       # %bb.0:
2677; SKYLAKE-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2678; SKYLAKE-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2679; SKYLAKE-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
2680; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2681;
2682; SKX-LABEL: test_permps:
2683; SKX:       # %bb.0:
2684; SKX-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
2685; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
2686; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
2687; SKX-NEXT:    retq # sched: [7:1.00]
2688;
2689; ZNVER1-LABEL: test_permps:
2690; ZNVER1:       # %bb.0:
2691; ZNVER1-NEXT:    vpermps %ymm1, %ymm0, %ymm1 # sched: [100:0.25]
2692; ZNVER1-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [107:0.50]
2693; ZNVER1-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
2694; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; NOTE(review): %a0 is presumably the index vector and the first argument the
  ; data being permuted; confirm against the intrinsic's definition.
2695  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
2696  %2 = load <8 x float>, <8 x float> *%a2, align 32
2697  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> %a0)
  ; The fadd consumes both results, so neither permute is dead.
2698  %4 = fadd <8 x float> %1, %3
2699  ret <8 x float> %4
2700}
; Declaration of the intrinsic called by test_permps above.
2701declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2702
; Shuffles %a0 (register source) with mask <3,2,2,3> and the vector loaded from
; %a1 (memory source) with mask <0,2,2,3>, then adds the two results so both
; shuffles feed the returned value. The autogenerated assertions below expect a
; register-form and a memory-form vpermq plus a vpaddq, each with the sched
; annotation printed for that CPU model.
2703define <4 x i64> @test_permq(<4 x i64> %a0, <4 x i64> *%a1) {
2704; GENERIC-LABEL: test_permq:
2705; GENERIC:       # %bb.0:
2706; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
2707; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [8:1.00]
2708; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2709; GENERIC-NEXT:    retq # sched: [1:1.00]
2710;
2711; HASWELL-LABEL: test_permq:
2712; HASWELL:       # %bb.0:
2713; HASWELL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2714; HASWELL-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
2715; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2716; HASWELL-NEXT:    retq # sched: [7:1.00]
2717;
2718; BROADWELL-LABEL: test_permq:
2719; BROADWELL:       # %bb.0:
2720; BROADWELL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2721; BROADWELL-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:1.00]
2722; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
2723; BROADWELL-NEXT:    retq # sched: [7:1.00]
2724;
2725; SKYLAKE-LABEL: test_permq:
2726; SKYLAKE:       # %bb.0:
2727; SKYLAKE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2728; SKYLAKE-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
2729; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
2730; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2731;
2732; SKX-LABEL: test_permq:
2733; SKX:       # %bb.0:
2734; SKX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
2735; SKX-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
2736; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
2737; SKX-NEXT:    retq # sched: [7:1.00]
2738;
2739; ZNVER1-LABEL: test_permq:
2740; ZNVER1:       # %bb.0:
2741; ZNVER1-NEXT:    vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:0.50]
2742; ZNVER1-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [2:0.25]
2743; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
2744; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; The two shuffles use different masks (3,2,2,3 versus 0,2,2,3).
2745  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
2746  %2 = load <4 x i64>, <4 x i64> *%a1, align 32
2747  %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 2, i32 3>
  ; The add consumes both shuffles, so neither permute is dead.
2748  %4 = add <4 x i64> %1, %3
2749  ret <4 x i64> %4
2750}
2751
; Scheduling test for the 128-bit vpgatherdd. The body is a single call to
; the llvm.x86.avx2.gather.d.d intrinsic with an i8 scale argument of 2; the
; assertions below expect exactly one vpgatherdd using (%rdi,%xmm1,2)
; addressing, followed by the return, for every CPU model.
2752define <4 x i32> @test_pgatherdd(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32> %a3) {
2753; GENERIC-LABEL: test_pgatherdd:
2754; GENERIC:       # %bb.0:
2755; GENERIC-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2756; GENERIC-NEXT:    retq # sched: [1:1.00]
2757;
2758; HASWELL-LABEL: test_pgatherdd:
2759; HASWELL:       # %bb.0:
2760; HASWELL-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
2761; HASWELL-NEXT:    retq # sched: [7:1.00]
2762;
2763; BROADWELL-LABEL: test_pgatherdd:
2764; BROADWELL:       # %bb.0:
2765; BROADWELL-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2766; BROADWELL-NEXT:    retq # sched: [7:1.00]
2767;
2768; SKYLAKE-LABEL: test_pgatherdd:
2769; SKYLAKE:       # %bb.0:
2770; SKYLAKE-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2771; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2772;
2773; SKX-LABEL: test_pgatherdd:
2774; SKX:       # %bb.0:
2775; SKX-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2776; SKX-NEXT:    retq # sched: [7:1.00]
2777;
2778; ZNVER1-LABEL: test_pgatherdd:
2779; ZNVER1:       # %bb.0:
2780; ZNVER1-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
2781; ZNVER1-NEXT:    retq # sched: [1:0.50]
2782  %1 = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32> %a3, i8 2)
2783  ret <4 x i32> %1
2784}
; Intrinsic used by test_pgatherdd.
2785declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
2786
; Scheduling test for the 256-bit vpgatherdd. The body is a single call to
; the llvm.x86.avx2.gather.d.d.256 intrinsic with an i8 scale argument of 2;
; the assertions below expect exactly one vpgatherdd on ymm registers using
; (%rdi,%ymm1,2) addressing for every CPU model.
2787define <8 x i32> @test_pgatherdd_ymm(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x i32> %a3) {
2788; GENERIC-LABEL: test_pgatherdd_ymm:
2789; GENERIC:       # %bb.0:
2790; GENERIC-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
2791; GENERIC-NEXT:    retq # sched: [1:1.00]
2792;
2793; HASWELL-LABEL: test_pgatherdd_ymm:
2794; HASWELL:       # %bb.0:
2795; HASWELL-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [27:6.50]
2796; HASWELL-NEXT:    retq # sched: [7:1.00]
2797;
2798; BROADWELL-LABEL: test_pgatherdd_ymm:
2799; BROADWELL:       # %bb.0:
2800; BROADWELL-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
2801; BROADWELL-NEXT:    retq # sched: [7:1.00]
2802;
2803; SKYLAKE-LABEL: test_pgatherdd_ymm:
2804; SKYLAKE:       # %bb.0:
2805; SKYLAKE-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
2806; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2807;
2808; SKX-LABEL: test_pgatherdd_ymm:
2809; SKX:       # %bb.0:
2810; SKX-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
2811; SKX-NEXT:    retq # sched: [7:1.00]
2812;
2813; ZNVER1-LABEL: test_pgatherdd_ymm:
2814; ZNVER1:       # %bb.0:
2815; ZNVER1-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [100:0.25]
2816; ZNVER1-NEXT:    retq # sched: [1:0.50]
2817  %1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x i32> %a3, i8 2)
2818  ret <8 x i32> %1
2819}
; Intrinsic used by test_pgatherdd_ymm.
2820declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
2821
; Scheduling test for the 128-bit vpgatherdq. The body is a single call to
; the llvm.x86.avx2.gather.d.q intrinsic (<4 x i32> third argument,
; <2 x i64> result) with an i8 scale argument of 2; the assertions below
; expect exactly one vpgatherdq using (%rdi,%xmm1,2) addressing.
2822define <2 x i64> @test_pgatherdq(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64> %a3) {
2823; GENERIC-LABEL: test_pgatherdq:
2824; GENERIC:       # %bb.0:
2825; GENERIC-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2826; GENERIC-NEXT:    retq # sched: [1:1.00]
2827;
2828; HASWELL-LABEL: test_pgatherdq:
2829; HASWELL:       # %bb.0:
2830; HASWELL-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
2831; HASWELL-NEXT:    retq # sched: [7:1.00]
2832;
2833; BROADWELL-LABEL: test_pgatherdq:
2834; BROADWELL:       # %bb.0:
2835; BROADWELL-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2836; BROADWELL-NEXT:    retq # sched: [7:1.00]
2837;
2838; SKYLAKE-LABEL: test_pgatherdq:
2839; SKYLAKE:       # %bb.0:
2840; SKYLAKE-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2841; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2842;
2843; SKX-LABEL: test_pgatherdq:
2844; SKX:       # %bb.0:
2845; SKX-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2846; SKX-NEXT:    retq # sched: [7:1.00]
2847;
2848; ZNVER1-LABEL: test_pgatherdq:
2849; ZNVER1:       # %bb.0:
2850; ZNVER1-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
2851; ZNVER1-NEXT:    retq # sched: [1:0.50]
2852  %1 = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64> %a3, i8 2)
2853  ret <2 x i64> %1
2854}
; Intrinsic used by test_pgatherdq.
2855declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
2856
; Scheduling test for the 256-bit vpgatherdq. The body is a single call to
; the llvm.x86.avx2.gather.d.q.256 intrinsic with an i8 scale argument of 2.
; The third argument is only <4 x i32>, so the expected instruction mixes an
; xmm register in the address, (%rdi,%xmm1,2), with ymm data registers.
2857define <4 x i64> @test_pgatherdq_ymm(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x i64> %a3) {
2858; GENERIC-LABEL: test_pgatherdq_ymm:
2859; GENERIC:       # %bb.0:
2860; GENERIC-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [5:0.50]
2861; GENERIC-NEXT:    retq # sched: [1:1.00]
2862;
2863; HASWELL-LABEL: test_pgatherdq_ymm:
2864; HASWELL:       # %bb.0:
2865; HASWELL-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [27:4.00]
2866; HASWELL-NEXT:    retq # sched: [7:1.00]
2867;
2868; BROADWELL-LABEL: test_pgatherdq_ymm:
2869; BROADWELL:       # %bb.0:
2870; BROADWELL-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [5:0.50]
2871; BROADWELL-NEXT:    retq # sched: [7:1.00]
2872;
2873; SKYLAKE-LABEL: test_pgatherdq_ymm:
2874; SKYLAKE:       # %bb.0:
2875; SKYLAKE-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [25:1.00]
2876; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2877;
2878; SKX-LABEL: test_pgatherdq_ymm:
2879; SKX:       # %bb.0:
2880; SKX-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [25:1.00]
2881; SKX-NEXT:    retq # sched: [7:1.00]
2882;
2883; ZNVER1-LABEL: test_pgatherdq_ymm:
2884; ZNVER1:       # %bb.0:
2885; ZNVER1-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [100:0.25]
2886; ZNVER1-NEXT:    retq # sched: [1:0.50]
2887  %1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x i64> %a3, i8 2)
2888  ret <4 x i64> %1
2889}
; Intrinsic used by test_pgatherdq_ymm.
2890declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
2891
; Scheduling test for the 128-bit vpgatherqd. The body is a single call to
; the llvm.x86.avx2.gather.q.d intrinsic (<2 x i64> third argument,
; <4 x i32> result) with an i8 scale argument of 2; the assertions below
; expect exactly one vpgatherqd using (%rdi,%xmm1,2) addressing.
2892define <4 x i32> @test_pgatherqd(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32> %a3) {
2893; GENERIC-LABEL: test_pgatherqd:
2894; GENERIC:       # %bb.0:
2895; GENERIC-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2896; GENERIC-NEXT:    retq # sched: [1:1.00]
2897;
2898; HASWELL-LABEL: test_pgatherqd:
2899; HASWELL:       # %bb.0:
2900; HASWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:5.00]
2901; HASWELL-NEXT:    retq # sched: [7:1.00]
2902;
2903; BROADWELL-LABEL: test_pgatherqd:
2904; BROADWELL:       # %bb.0:
2905; BROADWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2906; BROADWELL-NEXT:    retq # sched: [7:1.00]
2907;
2908; SKYLAKE-LABEL: test_pgatherqd:
2909; SKYLAKE:       # %bb.0:
2910; SKYLAKE-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2911; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2912;
2913; SKX-LABEL: test_pgatherqd:
2914; SKX:       # %bb.0:
2915; SKX-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2916; SKX-NEXT:    retq # sched: [7:1.00]
2917;
2918; ZNVER1-LABEL: test_pgatherqd:
2919; ZNVER1:       # %bb.0:
2920; ZNVER1-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
2921; ZNVER1-NEXT:    retq # sched: [1:0.50]
2922  %1 = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32> %a3, i8 2)
2923  ret <4 x i32> %1
2924}
; Intrinsic used by test_pgatherqd.
2925declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
2926
; Scheduling test for the 256-bit-index vpgatherqd. The body is a single call
; to the llvm.x86.avx2.gather.q.d.256 intrinsic with an i8 scale argument of
; 2. The third argument is <4 x i64> while the result is only <4 x i32>, so
; the expected instruction uses a ymm register in the address,
; (%rdi,%ymm1,2), with xmm data registers. Unlike the other gather tests in
; this file section, the assertions also expect a vzeroupper before the
; return.
2927define <4 x i32> @test_pgatherqd_ymm(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x i32> %a3) {
2928; GENERIC-LABEL: test_pgatherqd_ymm:
2929; GENERIC:       # %bb.0:
2930; GENERIC-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50]
2931; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
2932; GENERIC-NEXT:    retq # sched: [1:1.00]
2933;
2934; HASWELL-LABEL: test_pgatherqd_ymm:
2935; HASWELL:       # %bb.0:
2936; HASWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [28:5.00]
2937; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
2938; HASWELL-NEXT:    retq # sched: [7:1.00]
2939;
2940; BROADWELL-LABEL: test_pgatherqd_ymm:
2941; BROADWELL:       # %bb.0:
2942; BROADWELL-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50]
2943; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
2944; BROADWELL-NEXT:    retq # sched: [7:1.00]
2945;
2946; SKYLAKE-LABEL: test_pgatherqd_ymm:
2947; SKYLAKE:       # %bb.0:
2948; SKYLAKE-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
2949; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
2950; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2951;
2952; SKX-LABEL: test_pgatherqd_ymm:
2953; SKX:       # %bb.0:
2954; SKX-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
2955; SKX-NEXT:    vzeroupper # sched: [4:1.00]
2956; SKX-NEXT:    retq # sched: [7:1.00]
2957;
2958; ZNVER1-LABEL: test_pgatherqd_ymm:
2959; ZNVER1:       # %bb.0:
2960; ZNVER1-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [100:0.25]
2961; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
2962; ZNVER1-NEXT:    retq # sched: [1:0.50]
2963  %1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x i32> %a3, i8 2)
2964  ret <4 x i32> %1
2965}
; Intrinsic used by test_pgatherqd_ymm.
2966declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
2967
; Scheduling test for the 128-bit vpgatherqq. The body is a single call to
; the llvm.x86.avx2.gather.q.q intrinsic with an i8 scale argument of 2; the
; assertions below expect exactly one vpgatherqq using (%rdi,%xmm1,2)
; addressing, followed by the return, for every CPU model.
2968define <2 x i64> @test_pgatherqq(<2 x i64> %a0, i8 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
2969; GENERIC-LABEL: test_pgatherqq:
2970; GENERIC:       # %bb.0:
2971; GENERIC-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2972; GENERIC-NEXT:    retq # sched: [1:1.00]
2973;
2974; HASWELL-LABEL: test_pgatherqq:
2975; HASWELL:       # %bb.0:
2976; HASWELL-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [23:3.33]
2977; HASWELL-NEXT:    retq # sched: [7:1.00]
2978;
2979; BROADWELL-LABEL: test_pgatherqq:
2980; BROADWELL:       # %bb.0:
2981; BROADWELL-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
2982; BROADWELL-NEXT:    retq # sched: [7:1.00]
2983;
2984; SKYLAKE-LABEL: test_pgatherqq:
2985; SKYLAKE:       # %bb.0:
2986; SKYLAKE-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2987; SKYLAKE-NEXT:    retq # sched: [7:1.00]
2988;
2989; SKX-LABEL: test_pgatherqq:
2990; SKX:       # %bb.0:
2991; SKX-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
2992; SKX-NEXT:    retq # sched: [7:1.00]
2993;
2994; ZNVER1-LABEL: test_pgatherqq:
2995; ZNVER1:       # %bb.0:
2996; ZNVER1-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:0.25]
2997; ZNVER1-NEXT:    retq # sched: [1:0.50]
2998  %1 = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %a1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
2999  ret <2 x i64> %1
3000}
; Intrinsic used by test_pgatherqq.
3001declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
3002
; Scheduling test for the 256-bit vpgatherqq. The body is a single call to
; the llvm.x86.avx2.gather.q.q.256 intrinsic with an i8 scale argument of 2;
; the assertions below expect exactly one vpgatherqq on ymm registers using
; (%rdi,%ymm1,2) addressing for every CPU model.
3003define <4 x i64> @test_pgatherqq_ymm(<4 x i64> %a0, i8 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
3004; GENERIC-LABEL: test_pgatherqq_ymm:
3005; GENERIC:       # %bb.0:
3006; GENERIC-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
3007; GENERIC-NEXT:    retq # sched: [1:1.00]
3008;
3009; HASWELL-LABEL: test_pgatherqq_ymm:
3010; HASWELL:       # %bb.0:
3011; HASWELL-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [24:5.00]
3012; HASWELL-NEXT:    retq # sched: [7:1.00]
3013;
3014; BROADWELL-LABEL: test_pgatherqq_ymm:
3015; BROADWELL:       # %bb.0:
3016; BROADWELL-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
3017; BROADWELL-NEXT:    retq # sched: [7:1.00]
3018;
3019; SKYLAKE-LABEL: test_pgatherqq_ymm:
3020; SKYLAKE:       # %bb.0:
3021; SKYLAKE-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
3022; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3023;
3024; SKX-LABEL: test_pgatherqq_ymm:
3025; SKX:       # %bb.0:
3026; SKX-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
3027; SKX-NEXT:    retq # sched: [7:1.00]
3028;
3029; ZNVER1-LABEL: test_pgatherqq_ymm:
3030; ZNVER1:       # %bb.0:
3031; ZNVER1-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [100:0.25]
3032; ZNVER1-NEXT:    retq # sched: [1:0.50]
3033  %1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %a1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
3034  ret <4 x i64> %1
3035}
; Intrinsic used by test_pgatherqq_ymm.
3036declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
3037
; Scheduling test for vphaddd. The first llvm.x86.avx2.phadd.d call takes
; both operands from the register arguments %a0 and %a1; the second call
; combines that result with a vector loaded from %a2, which the assertions
; below expect to be folded into the (%rdi) memory-operand form.
3038define <8 x i32> @test_phaddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
3039; GENERIC-LABEL: test_phaddd:
3040; GENERIC:       # %bb.0:
3041; GENERIC-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
3042; GENERIC-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
3043; GENERIC-NEXT:    retq # sched: [1:1.00]
3044;
3045; HASWELL-LABEL: test_phaddd:
3046; HASWELL:       # %bb.0:
3047; HASWELL-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3048; HASWELL-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3049; HASWELL-NEXT:    retq # sched: [7:1.00]
3050;
3051; BROADWELL-LABEL: test_phaddd:
3052; BROADWELL:       # %bb.0:
3053; BROADWELL-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3054; BROADWELL-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
3055; BROADWELL-NEXT:    retq # sched: [7:1.00]
3056;
3057; SKYLAKE-LABEL: test_phaddd:
3058; SKYLAKE:       # %bb.0:
3059; SKYLAKE-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3060; SKYLAKE-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3061; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3062;
3063; SKX-LABEL: test_phaddd:
3064; SKX:       # %bb.0:
3065; SKX-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3066; SKX-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3067; SKX-NEXT:    retq # sched: [7:1.00]
3068;
3069; ZNVER1-LABEL: test_phaddd:
3070; ZNVER1:       # %bb.0:
3071; ZNVER1-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
3072; ZNVER1-NEXT:    vphaddd (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
3073; ZNVER1-NEXT:    retq # sched: [1:0.50]
3074  %1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
3075  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
3076  %3 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %2)
3077  ret <8 x i32> %3
3078}
; Intrinsic used by test_phaddd.
3079declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
3080
; Scheduling test for vphaddsw. The first llvm.x86.avx2.phadd.sw call takes
; both operands from the register arguments %a0 and %a1; the second call
; combines that result with a vector loaded from %a2, which the assertions
; below expect to be folded into the (%rdi) memory-operand form.
3081define <16 x i16> @test_phaddsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3082; GENERIC-LABEL: test_phaddsw:
3083; GENERIC:       # %bb.0:
3084; GENERIC-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
3085; GENERIC-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
3086; GENERIC-NEXT:    retq # sched: [1:1.00]
3087;
3088; HASWELL-LABEL: test_phaddsw:
3089; HASWELL:       # %bb.0:
3090; HASWELL-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3091; HASWELL-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3092; HASWELL-NEXT:    retq # sched: [7:1.00]
3093;
3094; BROADWELL-LABEL: test_phaddsw:
3095; BROADWELL:       # %bb.0:
3096; BROADWELL-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3097; BROADWELL-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
3098; BROADWELL-NEXT:    retq # sched: [7:1.00]
3099;
3100; SKYLAKE-LABEL: test_phaddsw:
3101; SKYLAKE:       # %bb.0:
3102; SKYLAKE-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3103; SKYLAKE-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3104; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3105;
3106; SKX-LABEL: test_phaddsw:
3107; SKX:       # %bb.0:
3108; SKX-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3109; SKX-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3110; SKX-NEXT:    retq # sched: [7:1.00]
3111;
3112; ZNVER1-LABEL: test_phaddsw:
3113; ZNVER1:       # %bb.0:
3114; ZNVER1-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
3115; ZNVER1-NEXT:    vphaddsw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
3116; ZNVER1-NEXT:    retq # sched: [1:0.50]
3117  %1 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
3118  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
3119  %3 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %1, <16 x i16> %2)
3120  ret <16 x i16> %3
3121}
; Intrinsic used by test_phaddsw.
3122declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
3123
; Scheduling test for vphaddw. The first llvm.x86.avx2.phadd.w call takes
; both operands from the register arguments %a0 and %a1; the second call
; combines that result with a vector loaded from %a2, which the assertions
; below expect to be folded into the (%rdi) memory-operand form.
3124define <16 x i16> @test_phaddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3125; GENERIC-LABEL: test_phaddw:
3126; GENERIC:       # %bb.0:
3127; GENERIC-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
3128; GENERIC-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
3129; GENERIC-NEXT:    retq # sched: [1:1.00]
3130;
3131; HASWELL-LABEL: test_phaddw:
3132; HASWELL:       # %bb.0:
3133; HASWELL-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3134; HASWELL-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3135; HASWELL-NEXT:    retq # sched: [7:1.00]
3136;
3137; BROADWELL-LABEL: test_phaddw:
3138; BROADWELL:       # %bb.0:
3139; BROADWELL-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3140; BROADWELL-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
3141; BROADWELL-NEXT:    retq # sched: [7:1.00]
3142;
3143; SKYLAKE-LABEL: test_phaddw:
3144; SKYLAKE:       # %bb.0:
3145; SKYLAKE-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3146; SKYLAKE-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3147; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3148;
3149; SKX-LABEL: test_phaddw:
3150; SKX:       # %bb.0:
3151; SKX-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3152; SKX-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3153; SKX-NEXT:    retq # sched: [7:1.00]
3154;
3155; ZNVER1-LABEL: test_phaddw:
3156; ZNVER1:       # %bb.0:
3157; ZNVER1-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
3158; ZNVER1-NEXT:    vphaddw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
3159; ZNVER1-NEXT:    retq # sched: [1:0.50]
3160  %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
3161  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
3162  %3 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2)
3163  ret <16 x i16> %3
3164}
; Intrinsic used by test_phaddw.
3165declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
3166
; Scheduling test for vphsubd. The first llvm.x86.avx2.phsub.d call takes
; both operands from the register arguments %a0 and %a1; the second call
; combines that result with a vector loaded from %a2, which the assertions
; below expect to be folded into the (%rdi) memory-operand form.
3167define <8 x i32> @test_phsubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
3168; GENERIC-LABEL: test_phsubd:
3169; GENERIC:       # %bb.0:
3170; GENERIC-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
3171; GENERIC-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
3172; GENERIC-NEXT:    retq # sched: [1:1.00]
3173;
3174; HASWELL-LABEL: test_phsubd:
3175; HASWELL:       # %bb.0:
3176; HASWELL-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3177; HASWELL-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3178; HASWELL-NEXT:    retq # sched: [7:1.00]
3179;
3180; BROADWELL-LABEL: test_phsubd:
3181; BROADWELL:       # %bb.0:
3182; BROADWELL-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3183; BROADWELL-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
3184; BROADWELL-NEXT:    retq # sched: [7:1.00]
3185;
3186; SKYLAKE-LABEL: test_phsubd:
3187; SKYLAKE:       # %bb.0:
3188; SKYLAKE-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3189; SKYLAKE-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3190; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3191;
3192; SKX-LABEL: test_phsubd:
3193; SKX:       # %bb.0:
3194; SKX-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3195; SKX-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3196; SKX-NEXT:    retq # sched: [7:1.00]
3197;
3198; ZNVER1-LABEL: test_phsubd:
3199; ZNVER1:       # %bb.0:
3200; ZNVER1-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
3201; ZNVER1-NEXT:    vphsubd (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
3202; ZNVER1-NEXT:    retq # sched: [1:0.50]
3203  %1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
3204  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
3205  %3 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %1, <8 x i32> %2)
3206  ret <8 x i32> %3
3207}
; Intrinsic used by test_phsubd.
3208declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
3209
; Scheduling test for vphsubsw. The first llvm.x86.avx2.phsub.sw call takes
; both operands from the register arguments %a0 and %a1; the second call
; combines that result with a vector loaded from %a2, which the assertions
; below expect to be folded into the (%rdi) memory-operand form.
3210define <16 x i16> @test_phsubsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3211; GENERIC-LABEL: test_phsubsw:
3212; GENERIC:       # %bb.0:
3213; GENERIC-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
3214; GENERIC-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
3215; GENERIC-NEXT:    retq # sched: [1:1.00]
3216;
3217; HASWELL-LABEL: test_phsubsw:
3218; HASWELL:       # %bb.0:
3219; HASWELL-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3220; HASWELL-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3221; HASWELL-NEXT:    retq # sched: [7:1.00]
3222;
3223; BROADWELL-LABEL: test_phsubsw:
3224; BROADWELL:       # %bb.0:
3225; BROADWELL-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3226; BROADWELL-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
3227; BROADWELL-NEXT:    retq # sched: [7:1.00]
3228;
3229; SKYLAKE-LABEL: test_phsubsw:
3230; SKYLAKE:       # %bb.0:
3231; SKYLAKE-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3232; SKYLAKE-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3233; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3234;
3235; SKX-LABEL: test_phsubsw:
3236; SKX:       # %bb.0:
3237; SKX-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3238; SKX-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3239; SKX-NEXT:    retq # sched: [7:1.00]
3240;
3241; ZNVER1-LABEL: test_phsubsw:
3242; ZNVER1:       # %bb.0:
3243; ZNVER1-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
3244; ZNVER1-NEXT:    vphsubsw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
3245; ZNVER1-NEXT:    retq # sched: [1:0.50]
3246  %1 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
3247  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
3248  %3 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %1, <16 x i16> %2)
3249  ret <16 x i16> %3
3250}
; Intrinsic used by test_phsubsw.
3251declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
3252
; Scheduling test for vphsubw. The first llvm.x86.avx2.phsub.w call takes
; both operands from the register arguments %a0 and %a1; the second call
; combines that result with a vector loaded from %a2, which the assertions
; below expect to be folded into the (%rdi) memory-operand form.
3253define <16 x i16> @test_phsubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3254; GENERIC-LABEL: test_phsubw:
3255; GENERIC:       # %bb.0:
3256; GENERIC-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:1.50]
3257; GENERIC-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:1.50]
3258; GENERIC-NEXT:    retq # sched: [1:1.00]
3259;
3260; HASWELL-LABEL: test_phsubw:
3261; HASWELL:       # %bb.0:
3262; HASWELL-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3263; HASWELL-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3264; HASWELL-NEXT:    retq # sched: [7:1.00]
3265;
3266; BROADWELL-LABEL: test_phsubw:
3267; BROADWELL:       # %bb.0:
3268; BROADWELL-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3269; BROADWELL-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
3270; BROADWELL-NEXT:    retq # sched: [7:1.00]
3271;
3272; SKYLAKE-LABEL: test_phsubw:
3273; SKYLAKE:       # %bb.0:
3274; SKYLAKE-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3275; SKYLAKE-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3276; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3277;
3278; SKX-LABEL: test_phsubw:
3279; SKX:       # %bb.0:
3280; SKX-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
3281; SKX-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
3282; SKX-NEXT:    retq # sched: [7:1.00]
3283;
3284; ZNVER1-LABEL: test_phsubw:
3285; ZNVER1:       # %bb.0:
3286; ZNVER1-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 # sched: [100:0.25]
3287; ZNVER1-NEXT:    vphsubw (%rdi), %ymm0, %ymm0 # sched: [100:0.25]
3288; ZNVER1-NEXT:    retq # sched: [1:0.50]
3289  %1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
3290  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
3291  %3 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %1, <16 x i16> %2)
3292  ret <16 x i16> %3
3293}
; Intrinsic used by test_phsubw.
3294declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
3295
; Scheduling test for vpmaddubsw. The first llvm.x86.avx2.pmadd.ub.sw call
; takes both operands from the register arguments %a0 and %a1; the second
; call combines that result with a vector loaded from %a2, which the
; assertions below expect to be folded into the (%rdi) memory-operand form.
3296define <16 x i16> @test_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
3297; GENERIC-LABEL: test_pmaddubsw:
3298; GENERIC:       # %bb.0:
3299; GENERIC-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
3300; GENERIC-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
3301; GENERIC-NEXT:    retq # sched: [1:1.00]
3302;
3303; HASWELL-LABEL: test_pmaddubsw:
3304; HASWELL:       # %bb.0:
3305; HASWELL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
3306; HASWELL-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
3307; HASWELL-NEXT:    retq # sched: [7:1.00]
3308;
3309; BROADWELL-LABEL: test_pmaddubsw:
3310; BROADWELL:       # %bb.0:
3311; BROADWELL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
3312; BROADWELL-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
3313; BROADWELL-NEXT:    retq # sched: [7:1.00]
3314;
3315; SKYLAKE-LABEL: test_pmaddubsw:
3316; SKYLAKE:       # %bb.0:
3317; SKYLAKE-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
3318; SKYLAKE-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
3319; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3320;
3321; SKX-LABEL: test_pmaddubsw:
3322; SKX:       # %bb.0:
3323; SKX-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
3324; SKX-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
3325; SKX-NEXT:    retq # sched: [7:1.00]
3326;
3327; ZNVER1-LABEL: test_pmaddubsw:
3328; ZNVER1:       # %bb.0:
3329; ZNVER1-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
3330; ZNVER1-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
3331; ZNVER1-NEXT:    retq # sched: [1:0.50]
3332  %1 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
; The intrinsic returns <16 x i16> but takes <32 x i8> operands, so the first
; result is reinterpreted before it is passed to the second call.
3333  %2 = bitcast <16 x i16> %1 to <32 x i8>
3334  %3 = load <32 x i8>, <32 x i8> *%a2, align 32
3335  %4 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %2, <32 x i8> %3)
3336  ret <16 x i16> %4
3337}
; Intrinsic used by test_pmaddubsw.
3338declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
3339
; Scheduling test for vpmaddwd. The first llvm.x86.avx2.pmadd.wd call takes
; both operands from the register arguments %a0 and %a1; the second call
; combines that result with a vector loaded from %a2, which the assertions
; below expect to be folded into the (%rdi) memory-operand form.
3340define <8 x i32> @test_pmaddwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3341; GENERIC-LABEL: test_pmaddwd:
3342; GENERIC:       # %bb.0:
3343; GENERIC-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
3344; GENERIC-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
3345; GENERIC-NEXT:    retq # sched: [1:1.00]
3346;
3347; HASWELL-LABEL: test_pmaddwd:
3348; HASWELL:       # %bb.0:
3349; HASWELL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
3350; HASWELL-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
3351; HASWELL-NEXT:    retq # sched: [7:1.00]
3352;
3353; BROADWELL-LABEL: test_pmaddwd:
3354; BROADWELL:       # %bb.0:
3355; BROADWELL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
3356; BROADWELL-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
3357; BROADWELL-NEXT:    retq # sched: [7:1.00]
3358;
3359; SKYLAKE-LABEL: test_pmaddwd:
3360; SKYLAKE:       # %bb.0:
3361; SKYLAKE-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
3362; SKYLAKE-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
3363; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3364;
3365; SKX-LABEL: test_pmaddwd:
3366; SKX:       # %bb.0:
3367; SKX-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
3368; SKX-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
3369; SKX-NEXT:    retq # sched: [7:1.00]
3370;
3371; ZNVER1-LABEL: test_pmaddwd:
3372; ZNVER1:       # %bb.0:
3373; ZNVER1-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
3374; ZNVER1-NEXT:    vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
3375; ZNVER1-NEXT:    retq # sched: [1:0.50]
3376  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
; The intrinsic returns <8 x i32> but takes <16 x i16> operands, so the first
; result is reinterpreted before it is passed to the second call.
3377  %2 = bitcast <8 x i32> %1 to <16 x i16>
3378  %3 = load <16 x i16>, <16 x i16> *%a2, align 32
3379  %4 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %2, <16 x i16> %3)
3380  ret <8 x i32> %4
3381}
; Intrinsic used by test_pmaddwd.
3382declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
3383
; Scheduling test for the 128-bit vpmaskmovd. The body calls the
; llvm.x86.avx2.maskload.d intrinsic on (%a0, %a1) and then the
; llvm.x86.avx2.maskstore.d intrinsic on (%a0, %a1, %a2) at the same
; pointer, and returns the loaded value. The assertions below expect the
; load form, then the store form, then a vmovdqa that copies the loaded
; value from %xmm2 into %xmm0 before the return.
3384define <4 x i32> @test_pmaskmovd(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
3385; GENERIC-LABEL: test_pmaskmovd:
3386; GENERIC:       # %bb.0:
3387; GENERIC-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
3388; GENERIC-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
3389; GENERIC-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3390; GENERIC-NEXT:    retq # sched: [1:1.00]
3391;
3392; HASWELL-LABEL: test_pmaskmovd:
3393; HASWELL:       # %bb.0:
3394; HASWELL-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
3395; HASWELL-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
3396; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3397; HASWELL-NEXT:    retq # sched: [7:1.00]
3398;
3399; BROADWELL-LABEL: test_pmaskmovd:
3400; BROADWELL:       # %bb.0:
3401; BROADWELL-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
3402; BROADWELL-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
3403; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3404; BROADWELL-NEXT:    retq # sched: [7:1.00]
3405;
3406; SKYLAKE-LABEL: test_pmaskmovd:
3407; SKYLAKE:       # %bb.0:
3408; SKYLAKE-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
3409; SKYLAKE-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
3410; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3411; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3412;
3413; SKX-LABEL: test_pmaskmovd:
3414; SKX:       # %bb.0:
3415; SKX-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
3416; SKX-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
3417; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3418; SKX-NEXT:    retq # sched: [7:1.00]
3419;
3420; ZNVER1-LABEL: test_pmaskmovd:
3421; ZNVER1:       # %bb.0:
3422; ZNVER1-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [100:0.25]
3423; ZNVER1-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [100:0.25]
3424; ZNVER1-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
3425; ZNVER1-NEXT:    retq # sched: [1:0.50]
3426  %1 = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1)
3427  call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2)
3428  ret <4 x i32> %1
3429}
; Intrinsics used by test_pmaskmovd.
3430declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
3431declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
3432
; Scheduling test for the 256-bit vpmaskmovd. The body calls the
; llvm.x86.avx2.maskload.d.256 intrinsic on (%a0, %a1) and then the
; llvm.x86.avx2.maskstore.d.256 intrinsic on (%a0, %a1, %a2) at the same
; pointer, and returns the loaded value. The assertions below expect the
; load form, then the store form, then a vmovdqa that copies the loaded
; value from %ymm2 into %ymm0 before the return.
3433define <8 x i32> @test_pmaskmovd_ymm(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
3434; GENERIC-LABEL: test_pmaskmovd_ymm:
3435; GENERIC:       # %bb.0:
3436; GENERIC-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
3437; GENERIC-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
3438; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
3439; GENERIC-NEXT:    retq # sched: [1:1.00]
3440;
3441; HASWELL-LABEL: test_pmaskmovd_ymm:
3442; HASWELL:       # %bb.0:
3443; HASWELL-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
3444; HASWELL-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
3445; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3446; HASWELL-NEXT:    retq # sched: [7:1.00]
3447;
3448; BROADWELL-LABEL: test_pmaskmovd_ymm:
3449; BROADWELL:       # %bb.0:
3450; BROADWELL-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
3451; BROADWELL-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
3452; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3453; BROADWELL-NEXT:    retq # sched: [7:1.00]
3454;
3455; SKYLAKE-LABEL: test_pmaskmovd_ymm:
3456; SKYLAKE:       # %bb.0:
3457; SKYLAKE-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
3458; SKYLAKE-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
3459; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3460; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3461;
3462; SKX-LABEL: test_pmaskmovd_ymm:
3463; SKX:       # %bb.0:
3464; SKX-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
3465; SKX-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
3466; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3467; SKX-NEXT:    retq # sched: [7:1.00]
3468;
3469; ZNVER1-LABEL: test_pmaskmovd_ymm:
3470; ZNVER1:       # %bb.0:
3471; ZNVER1-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [100:0.25]
3472; ZNVER1-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [100:0.25]
3473; ZNVER1-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [2:0.25]
3474; ZNVER1-NEXT:    retq # sched: [1:0.50]
3475  %1 = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1)
3476  call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
3477  ret <8 x i32> %1
3478}
; Intrinsics used by test_pmaskmovd_ymm.
3479declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
3480declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
3481
; Schedule test for the xmm forms of vpmaskmovq: one maskload and one
; maskstore intrinsic call through the same pointer %a0, both passed %a1 as
; their second operand; the maskload result is returned.
; The per-CPU assertions below are autogenerated (see the NOTE at the top of
; the file) and pin the "sched" comment that llc prints under -print-schedule.
; NOTE(review): the ZNVER1 load annotation here is [8:1.00] while the ZNVER1
; store annotation is [100:0.25]; if that looks wrong, fix the scheduling
; model and regenerate rather than hand-editing these lines.
3482define <2 x i64> @test_pmaskmovq(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
3483; GENERIC-LABEL: test_pmaskmovq:
3484; GENERIC:       # %bb.0:
3485; GENERIC-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
3486; GENERIC-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
3487; GENERIC-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3488; GENERIC-NEXT:    retq # sched: [1:1.00]
3489;
3490; HASWELL-LABEL: test_pmaskmovq:
3491; HASWELL:       # %bb.0:
3492; HASWELL-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
3493; HASWELL-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
3494; HASWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3495; HASWELL-NEXT:    retq # sched: [7:1.00]
3496;
3497; BROADWELL-LABEL: test_pmaskmovq:
3498; BROADWELL:       # %bb.0:
3499; BROADWELL-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
3500; BROADWELL-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
3501; BROADWELL-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3502; BROADWELL-NEXT:    retq # sched: [7:1.00]
3503;
3504; SKYLAKE-LABEL: test_pmaskmovq:
3505; SKYLAKE:       # %bb.0:
3506; SKYLAKE-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
3507; SKYLAKE-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
3508; SKYLAKE-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3509; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3510;
3511; SKX-LABEL: test_pmaskmovq:
3512; SKX:       # %bb.0:
3513; SKX-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
3514; SKX-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
3515; SKX-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
3516; SKX-NEXT:    retq # sched: [7:1.00]
3517;
3518; ZNVER1-LABEL: test_pmaskmovq:
3519; ZNVER1:       # %bb.0:
3520; ZNVER1-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
3521; ZNVER1-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [100:0.25]
3522; ZNVER1-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
3523; ZNVER1-NEXT:    retq # sched: [1:0.50]
; The maskload result %1 is the return value; the maskstore call additionally
; receives %a2 and its result is unused (void).
3524  %1 = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1)
3525  call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2)
3526  ret <2 x i64> %1
3527}
3528declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
3529declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
3530
; Schedule test for the ymm forms of vpmaskmovq: one maskload and one
; maskstore intrinsic call through the same pointer %a0, both passed %a1 as
; their second operand; the maskload result is returned.
; The per-CPU assertions below are autogenerated (see the NOTE at the top of
; the file) and pin the "sched" comment that llc prints under -print-schedule.
; NOTE(review): the ZNVER1 load annotation here is [9:1.50] while the ZNVER1
; store annotation is [100:0.25]; if that looks wrong, fix the scheduling
; model and regenerate rather than hand-editing these lines.
3531define <4 x i64> @test_pmaskmovq_ymm(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
3532; GENERIC-LABEL: test_pmaskmovq_ymm:
3533; GENERIC:       # %bb.0:
3534; GENERIC-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
3535; GENERIC-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
3536; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
3537; GENERIC-NEXT:    retq # sched: [1:1.00]
3538;
3539; HASWELL-LABEL: test_pmaskmovq_ymm:
3540; HASWELL:       # %bb.0:
3541; HASWELL-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
3542; HASWELL-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
3543; HASWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3544; HASWELL-NEXT:    retq # sched: [7:1.00]
3545;
3546; BROADWELL-LABEL: test_pmaskmovq_ymm:
3547; BROADWELL:       # %bb.0:
3548; BROADWELL-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
3549; BROADWELL-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
3550; BROADWELL-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3551; BROADWELL-NEXT:    retq # sched: [7:1.00]
3552;
3553; SKYLAKE-LABEL: test_pmaskmovq_ymm:
3554; SKYLAKE:       # %bb.0:
3555; SKYLAKE-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
3556; SKYLAKE-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
3557; SKYLAKE-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3558; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3559;
3560; SKX-LABEL: test_pmaskmovq_ymm:
3561; SKX:       # %bb.0:
3562; SKX-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
3563; SKX-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
3564; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
3565; SKX-NEXT:    retq # sched: [7:1.00]
3566;
3567; ZNVER1-LABEL: test_pmaskmovq_ymm:
3568; ZNVER1:       # %bb.0:
3569; ZNVER1-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:1.50]
3570; ZNVER1-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [100:0.25]
3571; ZNVER1-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [2:0.25]
3572; ZNVER1-NEXT:    retq # sched: [1:0.50]
; The maskload result %1 is the return value; the maskstore call additionally
; receives %a2 and its result is unused (void).
3573  %1 = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1)
3574  call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
3575  ret <4 x i64> %1
3576}
3577declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
3578declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
3579
; Schedule test for ymm vpmaxsb via the llvm.x86.avx2.pmaxs.b intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3580define <32 x i8> @test_pmaxsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
3581; GENERIC-LABEL: test_pmaxsb:
3582; GENERIC:       # %bb.0:
3583; GENERIC-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3584; GENERIC-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3585; GENERIC-NEXT:    retq # sched: [1:1.00]
3586;
3587; HASWELL-LABEL: test_pmaxsb:
3588; HASWELL:       # %bb.0:
3589; HASWELL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3590; HASWELL-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3591; HASWELL-NEXT:    retq # sched: [7:1.00]
3592;
3593; BROADWELL-LABEL: test_pmaxsb:
3594; BROADWELL:       # %bb.0:
3595; BROADWELL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3596; BROADWELL-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3597; BROADWELL-NEXT:    retq # sched: [7:1.00]
3598;
3599; SKYLAKE-LABEL: test_pmaxsb:
3600; SKYLAKE:       # %bb.0:
3601; SKYLAKE-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3602; SKYLAKE-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3603; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3604;
3605; SKX-LABEL: test_pmaxsb:
3606; SKX:       # %bb.0:
3607; SKX-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3608; SKX-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3609; SKX-NEXT:    retq # sched: [7:1.00]
3610;
3611; ZNVER1-LABEL: test_pmaxsb:
3612; ZNVER1:       # %bb.0:
3613; ZNVER1-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3614; ZNVER1-NEXT:    vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3615; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3616  %1 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
3617  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
3618  %3 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %1, <32 x i8> %2)
3619  ret <32 x i8> %3
3620}
3621declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
3622
; Schedule test for ymm vpmaxsd via the llvm.x86.avx2.pmaxs.d intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3623define <8 x i32> @test_pmaxsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
3624; GENERIC-LABEL: test_pmaxsd:
3625; GENERIC:       # %bb.0:
3626; GENERIC-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3627; GENERIC-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3628; GENERIC-NEXT:    retq # sched: [1:1.00]
3629;
3630; HASWELL-LABEL: test_pmaxsd:
3631; HASWELL:       # %bb.0:
3632; HASWELL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3633; HASWELL-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3634; HASWELL-NEXT:    retq # sched: [7:1.00]
3635;
3636; BROADWELL-LABEL: test_pmaxsd:
3637; BROADWELL:       # %bb.0:
3638; BROADWELL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3639; BROADWELL-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3640; BROADWELL-NEXT:    retq # sched: [7:1.00]
3641;
3642; SKYLAKE-LABEL: test_pmaxsd:
3643; SKYLAKE:       # %bb.0:
3644; SKYLAKE-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3645; SKYLAKE-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3646; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3647;
3648; SKX-LABEL: test_pmaxsd:
3649; SKX:       # %bb.0:
3650; SKX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3651; SKX-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3652; SKX-NEXT:    retq # sched: [7:1.00]
3653;
3654; ZNVER1-LABEL: test_pmaxsd:
3655; ZNVER1:       # %bb.0:
3656; ZNVER1-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3657; ZNVER1-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3658; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3659  %1 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
3660  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
3661  %3 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %1, <8 x i32> %2)
3662  ret <8 x i32> %3
3663}
3664declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
3665
; Schedule test for ymm vpmaxsw via the llvm.x86.avx2.pmaxs.w intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3666define <16 x i16> @test_pmaxsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3667; GENERIC-LABEL: test_pmaxsw:
3668; GENERIC:       # %bb.0:
3669; GENERIC-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3670; GENERIC-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3671; GENERIC-NEXT:    retq # sched: [1:1.00]
3672;
3673; HASWELL-LABEL: test_pmaxsw:
3674; HASWELL:       # %bb.0:
3675; HASWELL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3676; HASWELL-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3677; HASWELL-NEXT:    retq # sched: [7:1.00]
3678;
3679; BROADWELL-LABEL: test_pmaxsw:
3680; BROADWELL:       # %bb.0:
3681; BROADWELL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3682; BROADWELL-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3683; BROADWELL-NEXT:    retq # sched: [7:1.00]
3684;
3685; SKYLAKE-LABEL: test_pmaxsw:
3686; SKYLAKE:       # %bb.0:
3687; SKYLAKE-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3688; SKYLAKE-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3689; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3690;
3691; SKX-LABEL: test_pmaxsw:
3692; SKX:       # %bb.0:
3693; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3694; SKX-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3695; SKX-NEXT:    retq # sched: [7:1.00]
3696;
3697; ZNVER1-LABEL: test_pmaxsw:
3698; ZNVER1:       # %bb.0:
3699; ZNVER1-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3700; ZNVER1-NEXT:    vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3701; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3702  %1 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
3703  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
3704  %3 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %1, <16 x i16> %2)
3705  ret <16 x i16> %3
3706}
3707declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
3708
; Schedule test for ymm vpmaxub via the llvm.x86.avx2.pmaxu.b intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3709define <32 x i8> @test_pmaxub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
3710; GENERIC-LABEL: test_pmaxub:
3711; GENERIC:       # %bb.0:
3712; GENERIC-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3713; GENERIC-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3714; GENERIC-NEXT:    retq # sched: [1:1.00]
3715;
3716; HASWELL-LABEL: test_pmaxub:
3717; HASWELL:       # %bb.0:
3718; HASWELL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3719; HASWELL-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3720; HASWELL-NEXT:    retq # sched: [7:1.00]
3721;
3722; BROADWELL-LABEL: test_pmaxub:
3723; BROADWELL:       # %bb.0:
3724; BROADWELL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3725; BROADWELL-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3726; BROADWELL-NEXT:    retq # sched: [7:1.00]
3727;
3728; SKYLAKE-LABEL: test_pmaxub:
3729; SKYLAKE:       # %bb.0:
3730; SKYLAKE-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3731; SKYLAKE-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3732; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3733;
3734; SKX-LABEL: test_pmaxub:
3735; SKX:       # %bb.0:
3736; SKX-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3737; SKX-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3738; SKX-NEXT:    retq # sched: [7:1.00]
3739;
3740; ZNVER1-LABEL: test_pmaxub:
3741; ZNVER1:       # %bb.0:
3742; ZNVER1-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3743; ZNVER1-NEXT:    vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3744; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3745  %1 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
3746  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
3747  %3 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %1, <32 x i8> %2)
3748  ret <32 x i8> %3
3749}
3750declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
3751
; Schedule test for ymm vpmaxud via the llvm.x86.avx2.pmaxu.d intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3752define <8 x i32> @test_pmaxud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
3753; GENERIC-LABEL: test_pmaxud:
3754; GENERIC:       # %bb.0:
3755; GENERIC-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3756; GENERIC-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3757; GENERIC-NEXT:    retq # sched: [1:1.00]
3758;
3759; HASWELL-LABEL: test_pmaxud:
3760; HASWELL:       # %bb.0:
3761; HASWELL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3762; HASWELL-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3763; HASWELL-NEXT:    retq # sched: [7:1.00]
3764;
3765; BROADWELL-LABEL: test_pmaxud:
3766; BROADWELL:       # %bb.0:
3767; BROADWELL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3768; BROADWELL-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3769; BROADWELL-NEXT:    retq # sched: [7:1.00]
3770;
3771; SKYLAKE-LABEL: test_pmaxud:
3772; SKYLAKE:       # %bb.0:
3773; SKYLAKE-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3774; SKYLAKE-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3775; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3776;
3777; SKX-LABEL: test_pmaxud:
3778; SKX:       # %bb.0:
3779; SKX-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3780; SKX-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3781; SKX-NEXT:    retq # sched: [7:1.00]
3782;
3783; ZNVER1-LABEL: test_pmaxud:
3784; ZNVER1:       # %bb.0:
3785; ZNVER1-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3786; ZNVER1-NEXT:    vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3787; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3788  %1 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
3789  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
3790  %3 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %1, <8 x i32> %2)
3791  ret <8 x i32> %3
3792}
3793declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
3794
; Schedule test for ymm vpmaxuw via the llvm.x86.avx2.pmaxu.w intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3795define <16 x i16> @test_pmaxuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3796; GENERIC-LABEL: test_pmaxuw:
3797; GENERIC:       # %bb.0:
3798; GENERIC-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3799; GENERIC-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3800; GENERIC-NEXT:    retq # sched: [1:1.00]
3801;
3802; HASWELL-LABEL: test_pmaxuw:
3803; HASWELL:       # %bb.0:
3804; HASWELL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3805; HASWELL-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3806; HASWELL-NEXT:    retq # sched: [7:1.00]
3807;
3808; BROADWELL-LABEL: test_pmaxuw:
3809; BROADWELL:       # %bb.0:
3810; BROADWELL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3811; BROADWELL-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3812; BROADWELL-NEXT:    retq # sched: [7:1.00]
3813;
3814; SKYLAKE-LABEL: test_pmaxuw:
3815; SKYLAKE:       # %bb.0:
3816; SKYLAKE-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3817; SKYLAKE-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3818; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3819;
3820; SKX-LABEL: test_pmaxuw:
3821; SKX:       # %bb.0:
3822; SKX-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3823; SKX-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3824; SKX-NEXT:    retq # sched: [7:1.00]
3825;
3826; ZNVER1-LABEL: test_pmaxuw:
3827; ZNVER1:       # %bb.0:
3828; ZNVER1-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3829; ZNVER1-NEXT:    vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3830; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3831  %1 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
3832  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
3833  %3 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %1, <16 x i16> %2)
3834  ret <16 x i16> %3
3835}
3836declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
3837
; Schedule test for ymm vpminsb via the llvm.x86.avx2.pmins.b intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3838define <32 x i8> @test_pminsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
3839; GENERIC-LABEL: test_pminsb:
3840; GENERIC:       # %bb.0:
3841; GENERIC-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3842; GENERIC-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3843; GENERIC-NEXT:    retq # sched: [1:1.00]
3844;
3845; HASWELL-LABEL: test_pminsb:
3846; HASWELL:       # %bb.0:
3847; HASWELL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3848; HASWELL-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3849; HASWELL-NEXT:    retq # sched: [7:1.00]
3850;
3851; BROADWELL-LABEL: test_pminsb:
3852; BROADWELL:       # %bb.0:
3853; BROADWELL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3854; BROADWELL-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3855; BROADWELL-NEXT:    retq # sched: [7:1.00]
3856;
3857; SKYLAKE-LABEL: test_pminsb:
3858; SKYLAKE:       # %bb.0:
3859; SKYLAKE-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3860; SKYLAKE-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3861; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3862;
3863; SKX-LABEL: test_pminsb:
3864; SKX:       # %bb.0:
3865; SKX-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3866; SKX-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3867; SKX-NEXT:    retq # sched: [7:1.00]
3868;
3869; ZNVER1-LABEL: test_pminsb:
3870; ZNVER1:       # %bb.0:
3871; ZNVER1-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3872; ZNVER1-NEXT:    vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3873; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3874  %1 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
3875  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
3876  %3 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %1, <32 x i8> %2)
3877  ret <32 x i8> %3
3878}
3879declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
3880
; Schedule test for ymm vpminsd via the llvm.x86.avx2.pmins.d intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3881define <8 x i32> @test_pminsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
3882; GENERIC-LABEL: test_pminsd:
3883; GENERIC:       # %bb.0:
3884; GENERIC-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3885; GENERIC-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3886; GENERIC-NEXT:    retq # sched: [1:1.00]
3887;
3888; HASWELL-LABEL: test_pminsd:
3889; HASWELL:       # %bb.0:
3890; HASWELL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3891; HASWELL-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3892; HASWELL-NEXT:    retq # sched: [7:1.00]
3893;
3894; BROADWELL-LABEL: test_pminsd:
3895; BROADWELL:       # %bb.0:
3896; BROADWELL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3897; BROADWELL-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3898; BROADWELL-NEXT:    retq # sched: [7:1.00]
3899;
3900; SKYLAKE-LABEL: test_pminsd:
3901; SKYLAKE:       # %bb.0:
3902; SKYLAKE-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3903; SKYLAKE-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3904; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3905;
3906; SKX-LABEL: test_pminsd:
3907; SKX:       # %bb.0:
3908; SKX-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3909; SKX-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3910; SKX-NEXT:    retq # sched: [7:1.00]
3911;
3912; ZNVER1-LABEL: test_pminsd:
3913; ZNVER1:       # %bb.0:
3914; ZNVER1-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3915; ZNVER1-NEXT:    vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3916; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3917  %1 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
3918  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
3919  %3 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %1, <8 x i32> %2)
3920  ret <8 x i32> %3
3921}
3922declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
3923
; Schedule test for ymm vpminsw via the llvm.x86.avx2.pmins.w intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3924define <16 x i16> @test_pminsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
3925; GENERIC-LABEL: test_pminsw:
3926; GENERIC:       # %bb.0:
3927; GENERIC-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3928; GENERIC-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3929; GENERIC-NEXT:    retq # sched: [1:1.00]
3930;
3931; HASWELL-LABEL: test_pminsw:
3932; HASWELL:       # %bb.0:
3933; HASWELL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3934; HASWELL-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3935; HASWELL-NEXT:    retq # sched: [7:1.00]
3936;
3937; BROADWELL-LABEL: test_pminsw:
3938; BROADWELL:       # %bb.0:
3939; BROADWELL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3940; BROADWELL-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3941; BROADWELL-NEXT:    retq # sched: [7:1.00]
3942;
3943; SKYLAKE-LABEL: test_pminsw:
3944; SKYLAKE:       # %bb.0:
3945; SKYLAKE-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3946; SKYLAKE-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3947; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3948;
3949; SKX-LABEL: test_pminsw:
3950; SKX:       # %bb.0:
3951; SKX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3952; SKX-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3953; SKX-NEXT:    retq # sched: [7:1.00]
3954;
3955; ZNVER1-LABEL: test_pminsw:
3956; ZNVER1:       # %bb.0:
3957; ZNVER1-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
3958; ZNVER1-NEXT:    vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3959; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
3960  %1 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
3961  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
3962  %3 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %1, <16 x i16> %2)
3963  ret <16 x i16> %3
3964}
3965declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
3966
; Schedule test for ymm vpminub via the llvm.x86.avx2.pminu.b intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
3967define <32 x i8> @test_pminub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
3968; GENERIC-LABEL: test_pminub:
3969; GENERIC:       # %bb.0:
3970; GENERIC-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3971; GENERIC-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3972; GENERIC-NEXT:    retq # sched: [1:1.00]
3973;
3974; HASWELL-LABEL: test_pminub:
3975; HASWELL:       # %bb.0:
3976; HASWELL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3977; HASWELL-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3978; HASWELL-NEXT:    retq # sched: [7:1.00]
3979;
3980; BROADWELL-LABEL: test_pminub:
3981; BROADWELL:       # %bb.0:
3982; BROADWELL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3983; BROADWELL-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
3984; BROADWELL-NEXT:    retq # sched: [7:1.00]
3985;
3986; SKYLAKE-LABEL: test_pminub:
3987; SKYLAKE:       # %bb.0:
3988; SKYLAKE-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3989; SKYLAKE-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3990; SKYLAKE-NEXT:    retq # sched: [7:1.00]
3991;
3992; SKX-LABEL: test_pminub:
3993; SKX:       # %bb.0:
3994; SKX-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
3995; SKX-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
3996; SKX-NEXT:    retq # sched: [7:1.00]
3997;
3998; ZNVER1-LABEL: test_pminub:
3999; ZNVER1:       # %bb.0:
4000; ZNVER1-NEXT:    vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4001; ZNVER1-NEXT:    vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4002; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
4003  %1 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
4004  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
4005  %3 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %1, <32 x i8> %2)
4006  ret <32 x i8> %3
4007}
4008declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
4009
; Schedule test for ymm vpminud via the llvm.x86.avx2.pminu.d intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
4010define <8 x i32> @test_pminud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
4011; GENERIC-LABEL: test_pminud:
4012; GENERIC:       # %bb.0:
4013; GENERIC-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4014; GENERIC-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4015; GENERIC-NEXT:    retq # sched: [1:1.00]
4016;
4017; HASWELL-LABEL: test_pminud:
4018; HASWELL:       # %bb.0:
4019; HASWELL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4020; HASWELL-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4021; HASWELL-NEXT:    retq # sched: [7:1.00]
4022;
4023; BROADWELL-LABEL: test_pminud:
4024; BROADWELL:       # %bb.0:
4025; BROADWELL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4026; BROADWELL-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
4027; BROADWELL-NEXT:    retq # sched: [7:1.00]
4028;
4029; SKYLAKE-LABEL: test_pminud:
4030; SKYLAKE:       # %bb.0:
4031; SKYLAKE-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4032; SKYLAKE-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4033; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4034;
4035; SKX-LABEL: test_pminud:
4036; SKX:       # %bb.0:
4037; SKX-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4038; SKX-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4039; SKX-NEXT:    retq # sched: [7:1.00]
4040;
4041; ZNVER1-LABEL: test_pminud:
4042; ZNVER1:       # %bb.0:
4043; ZNVER1-NEXT:    vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4044; ZNVER1-NEXT:    vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4045; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
4046  %1 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
4047  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
4048  %3 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %1, <8 x i32> %2)
4049  ret <8 x i32> %3
4050}
4051declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
4052
; Schedule test for ymm vpminuw via the llvm.x86.avx2.pminu.w intrinsic.
; The intrinsic is called once on the two register arguments and once on that
; result plus a vector loaded from %a2, so the autogenerated per-CPU
; assertions cover both the register form and the "(%rdi)" memory form.
4053define <16 x i16> @test_pminuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
4054; GENERIC-LABEL: test_pminuw:
4055; GENERIC:       # %bb.0:
4056; GENERIC-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4057; GENERIC-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4058; GENERIC-NEXT:    retq # sched: [1:1.00]
4059;
4060; HASWELL-LABEL: test_pminuw:
4061; HASWELL:       # %bb.0:
4062; HASWELL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4063; HASWELL-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4064; HASWELL-NEXT:    retq # sched: [7:1.00]
4065;
4066; BROADWELL-LABEL: test_pminuw:
4067; BROADWELL:       # %bb.0:
4068; BROADWELL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4069; BROADWELL-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
4070; BROADWELL-NEXT:    retq # sched: [7:1.00]
4071;
4072; SKYLAKE-LABEL: test_pminuw:
4073; SKYLAKE:       # %bb.0:
4074; SKYLAKE-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4075; SKYLAKE-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4076; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4077;
4078; SKX-LABEL: test_pminuw:
4079; SKX:       # %bb.0:
4080; SKX-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4081; SKX-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4082; SKX-NEXT:    retq # sched: [7:1.00]
4083;
4084; ZNVER1-LABEL: test_pminuw:
4085; ZNVER1:       # %bb.0:
4086; ZNVER1-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4087; ZNVER1-NEXT:    vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
4088; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register use; %3 consumes the 32-byte-aligned load %2.
4089  %1 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
4090  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
4091  %3 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %1, <16 x i16> %2)
4092  ret <16 x i16> %3
4093}
4094declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
4095
; Schedule test for the ymm form of vpmovmskb via the llvm.x86.avx2.pmovmskb
; intrinsic; the i32 result is returned directly in %eax.
; Besides the vpmovmskb annotation, every autogenerated per-CPU assertion
; block also expects a vzeroupper (with its own "sched" comment) before retq.
4096define i32 @test_pmovmskb(<32 x i8> %a0) {
4097; GENERIC-LABEL: test_pmovmskb:
4098; GENERIC:       # %bb.0:
4099; GENERIC-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
4100; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
4101; GENERIC-NEXT:    retq # sched: [1:1.00]
4102;
4103; HASWELL-LABEL: test_pmovmskb:
4104; HASWELL:       # %bb.0:
4105; HASWELL-NEXT:    vpmovmskb %ymm0, %eax # sched: [3:1.00]
4106; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
4107; HASWELL-NEXT:    retq # sched: [7:1.00]
4108;
4109; BROADWELL-LABEL: test_pmovmskb:
4110; BROADWELL:       # %bb.0:
4111; BROADWELL-NEXT:    vpmovmskb %ymm0, %eax # sched: [3:1.00]
4112; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
4113; BROADWELL-NEXT:    retq # sched: [7:1.00]
4114;
4115; SKYLAKE-LABEL: test_pmovmskb:
4116; SKYLAKE:       # %bb.0:
4117; SKYLAKE-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
4118; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
4119; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4120;
4121; SKX-LABEL: test_pmovmskb:
4122; SKX:       # %bb.0:
4123; SKX-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:1.00]
4124; SKX-NEXT:    vzeroupper # sched: [4:1.00]
4125; SKX-NEXT:    retq # sched: [7:1.00]
4126;
4127; ZNVER1-LABEL: test_pmovmskb:
4128; ZNVER1:       # %bb.0:
4129; ZNVER1-NEXT:    vpmovmskb %ymm0, %eax # sched: [2:2.00]
4130; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
4131; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Single intrinsic call on the register argument; there is no memory operand
; in this test.
4132  %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0)
4133  ret i32 %1
4134}
4135declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
4136
; Schedule test for vpmovsxbd (xmm/mem source, ymm destination), written with
; generic IR rather than an intrinsic: the low 8 bytes of %a0 and of the
; vector loaded from %a1 are each sign-extended to <8 x i32>, then added so
; that both extensions stay live.
; The autogenerated assertions expect the register form before the "(%rdi)"
; memory form on every CPU except znver1, where the memory form comes first.
4137define <8 x i32> @test_pmovsxbd(<16 x i8> %a0, <16 x i8> *%a1) {
4138; GENERIC-LABEL: test_pmovsxbd:
4139; GENERIC:       # %bb.0:
4140; GENERIC-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [1:1.00]
4141; GENERIC-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
4142; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4143; GENERIC-NEXT:    retq # sched: [1:1.00]
4144;
4145; HASWELL-LABEL: test_pmovsxbd:
4146; HASWELL:       # %bb.0:
4147; HASWELL-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
4148; HASWELL-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
4149; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4150; HASWELL-NEXT:    retq # sched: [7:1.00]
4151;
4152; BROADWELL-LABEL: test_pmovsxbd:
4153; BROADWELL:       # %bb.0:
4154; BROADWELL-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
4155; BROADWELL-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
4156; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4157; BROADWELL-NEXT:    retq # sched: [7:1.00]
4158;
4159; SKYLAKE-LABEL: test_pmovsxbd:
4160; SKYLAKE:       # %bb.0:
4161; SKYLAKE-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
4162; SKYLAKE-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
4163; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4164; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4165;
4166; SKX-LABEL: test_pmovsxbd:
4167; SKX:       # %bb.0:
4168; SKX-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
4169; SKX-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
4170; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4171; SKX-NEXT:    retq # sched: [7:1.00]
4172;
4173; ZNVER1-LABEL: test_pmovsxbd:
4174; ZNVER1:       # %bb.0:
4175; ZNVER1-NEXT:    vpmovsxbd (%rdi), %ymm1 # sched: [8:0.50]
4176; ZNVER1-NEXT:    vpmovsxbd %xmm0, %ymm0 # sched: [1:0.50]
4177; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4178; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1/%2: take elements 0..7 of the register argument and sign-extend them.
; %3..%5: the same on a 16-byte-aligned load from %a1 (the memory form).
; %6: the add consumes both results and is what the vpaddd assertion matches.
4179  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4180  %2 = sext <8 x i8> %1 to <8 x i32>
4181  %3 = load <16 x i8>, <16 x i8> *%a1, align 16
4182  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4183  %5 = sext <8 x i8> %4 to <8 x i32>
4184  %6 = add <8 x i32> %2, %5
4185  ret <8 x i32> %6
4186}
4187
; Exercises vpmovsxbq (sign-extend i8 lanes to i64) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations.
4188define <4 x i64> @test_pmovsxbq(<16 x i8> %a0, <16 x i8> *%a1) {
4189; GENERIC-LABEL: test_pmovsxbq:
4190; GENERIC:       # %bb.0:
4191; GENERIC-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [1:1.00]
4192; GENERIC-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
4193; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4194; GENERIC-NEXT:    retq # sched: [1:1.00]
4195;
4196; HASWELL-LABEL: test_pmovsxbq:
4197; HASWELL:       # %bb.0:
4198; HASWELL-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
4199; HASWELL-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
4200; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4201; HASWELL-NEXT:    retq # sched: [7:1.00]
4202;
4203; BROADWELL-LABEL: test_pmovsxbq:
4204; BROADWELL:       # %bb.0:
4205; BROADWELL-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
4206; BROADWELL-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
4207; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4208; BROADWELL-NEXT:    retq # sched: [7:1.00]
4209;
4210; SKYLAKE-LABEL: test_pmovsxbq:
4211; SKYLAKE:       # %bb.0:
4212; SKYLAKE-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
4213; SKYLAKE-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
4214; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4215; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4216;
4217; SKX-LABEL: test_pmovsxbq:
4218; SKX:       # %bb.0:
4219; SKX-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
4220; SKX-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
4221; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4222; SKX-NEXT:    retq # sched: [7:1.00]
4223;
4224; ZNVER1-LABEL: test_pmovsxbq:
4225; ZNVER1:       # %bb.0:
4226; ZNVER1-NEXT:    vpmovsxbq (%rdi), %ymm1 # sched: [8:0.50]
4227; ZNVER1-NEXT:    vpmovsxbq %xmm0, %ymm0 # sched: [1:0.50]
4228; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4229; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: keep elements 0-3 of %a0 and sign-extend them to i64.
4230  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4231  %2 = sext <4 x i8> %1 to <4 x i64>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4232  %3 = load <16 x i8>, <16 x i8> *%a1, align 16
4233  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4234  %5 = sext <4 x i8> %4 to <4 x i64>
  ; Both extended vectors feed the returned sum.
4235  %6 = add <4 x i64> %2, %5
4236  ret <4 x i64> %6
4237}
4238
; Exercises vpmovsxbw (sign-extend i8 lanes to i16) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations.
4239define <16 x i16> @test_pmovsxbw(<16 x i8> %a0, <16 x i8> *%a1) {
4240; GENERIC-LABEL: test_pmovsxbw:
4241; GENERIC:       # %bb.0:
4242; GENERIC-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [1:1.00]
4243; GENERIC-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [8:1.00]
4244; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4245; GENERIC-NEXT:    retq # sched: [1:1.00]
4246;
4247; HASWELL-LABEL: test_pmovsxbw:
4248; HASWELL:       # %bb.0:
4249; HASWELL-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
4250; HASWELL-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
4251; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4252; HASWELL-NEXT:    retq # sched: [7:1.00]
4253;
4254; BROADWELL-LABEL: test_pmovsxbw:
4255; BROADWELL:       # %bb.0:
4256; BROADWELL-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
4257; BROADWELL-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [8:1.00]
4258; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4259; BROADWELL-NEXT:    retq # sched: [7:1.00]
4260;
4261; SKYLAKE-LABEL: test_pmovsxbw:
4262; SKYLAKE:       # %bb.0:
4263; SKYLAKE-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
4264; SKYLAKE-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
4265; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4266; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4267;
4268; SKX-LABEL: test_pmovsxbw:
4269; SKX:       # %bb.0:
4270; SKX-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
4271; SKX-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
4272; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4273; SKX-NEXT:    retq # sched: [7:1.00]
4274;
4275; ZNVER1-LABEL: test_pmovsxbw:
4276; ZNVER1:       # %bb.0:
4277; ZNVER1-NEXT:    vpmovsxbw (%rdi), %ymm1 # sched: [8:0.50]
4278; ZNVER1-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [1:0.50]
4279; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4280; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: sign-extend all sixteen bytes of %a0 to i16.
4281  %1 = sext <16 x i8> %a0 to <16 x i16>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4282  %2 = load <16 x i8>, <16 x i8> *%a1, align 16
4283  %3 = sext <16 x i8> %2 to <16 x i16>
  ; Both extended vectors feed the returned sum.
4284  %4 = add <16 x i16> %1, %3
4285  ret <16 x i16> %4
4286}
4287
; Exercises vpmovsxdq (sign-extend i32 lanes to i64) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations.
4288define <4 x i64> @test_pmovsxdq(<4 x i32> %a0, <4 x i32> *%a1) {
4289; GENERIC-LABEL: test_pmovsxdq:
4290; GENERIC:       # %bb.0:
4291; GENERIC-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [1:1.00]
4292; GENERIC-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [8:1.00]
4293; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4294; GENERIC-NEXT:    retq # sched: [1:1.00]
4295;
4296; HASWELL-LABEL: test_pmovsxdq:
4297; HASWELL:       # %bb.0:
4298; HASWELL-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
4299; HASWELL-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
4300; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4301; HASWELL-NEXT:    retq # sched: [7:1.00]
4302;
4303; BROADWELL-LABEL: test_pmovsxdq:
4304; BROADWELL:       # %bb.0:
4305; BROADWELL-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
4306; BROADWELL-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [8:1.00]
4307; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4308; BROADWELL-NEXT:    retq # sched: [7:1.00]
4309;
4310; SKYLAKE-LABEL: test_pmovsxdq:
4311; SKYLAKE:       # %bb.0:
4312; SKYLAKE-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
4313; SKYLAKE-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
4314; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4315; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4316;
4317; SKX-LABEL: test_pmovsxdq:
4318; SKX:       # %bb.0:
4319; SKX-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
4320; SKX-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
4321; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4322; SKX-NEXT:    retq # sched: [7:1.00]
4323;
4324; ZNVER1-LABEL: test_pmovsxdq:
4325; ZNVER1:       # %bb.0:
4326; ZNVER1-NEXT:    vpmovsxdq (%rdi), %ymm1 # sched: [8:0.50]
4327; ZNVER1-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [1:0.50]
4328; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4329; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: sign-extend all four i32 elements of %a0 to i64.
4330  %1 = sext <4 x i32> %a0 to <4 x i64>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4331  %2 = load <4 x i32>, <4 x i32> *%a1, align 16
4332  %3 = sext <4 x i32> %2 to <4 x i64>
  ; Both extended vectors feed the returned sum.
4333  %4 = add <4 x i64> %1, %3
4334  ret <4 x i64> %4
4335}
4336
; Exercises vpmovsxwd (sign-extend i16 lanes to i32) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations.
4337define <8 x i32> @test_pmovsxwd(<8 x i16> %a0, <8 x i16> *%a1) {
4338; GENERIC-LABEL: test_pmovsxwd:
4339; GENERIC:       # %bb.0:
4340; GENERIC-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00]
4341; GENERIC-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [8:1.00]
4342; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4343; GENERIC-NEXT:    retq # sched: [1:1.00]
4344;
4345; HASWELL-LABEL: test_pmovsxwd:
4346; HASWELL:       # %bb.0:
4347; HASWELL-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
4348; HASWELL-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
4349; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4350; HASWELL-NEXT:    retq # sched: [7:1.00]
4351;
4352; BROADWELL-LABEL: test_pmovsxwd:
4353; BROADWELL:       # %bb.0:
4354; BROADWELL-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
4355; BROADWELL-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [8:1.00]
4356; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4357; BROADWELL-NEXT:    retq # sched: [7:1.00]
4358;
4359; SKYLAKE-LABEL: test_pmovsxwd:
4360; SKYLAKE:       # %bb.0:
4361; SKYLAKE-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
4362; SKYLAKE-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
4363; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4364; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4365;
4366; SKX-LABEL: test_pmovsxwd:
4367; SKX:       # %bb.0:
4368; SKX-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
4369; SKX-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
4370; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4371; SKX-NEXT:    retq # sched: [7:1.00]
4372;
4373; ZNVER1-LABEL: test_pmovsxwd:
4374; ZNVER1:       # %bb.0:
4375; ZNVER1-NEXT:    vpmovsxwd (%rdi), %ymm1 # sched: [8:0.50]
4376; ZNVER1-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [1:0.50]
4377; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4378; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: sign-extend all eight i16 elements of %a0 to i32.
4379  %1 = sext <8 x i16> %a0 to <8 x i32>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4380  %2 = load <8 x i16>, <8 x i16> *%a1, align 16
4381  %3 = sext <8 x i16> %2 to <8 x i32>
  ; Both extended vectors feed the returned sum.
4382  %4 = add <8 x i32> %1, %3
4383  ret <8 x i32> %4
4384}
4385
; Exercises vpmovsxwq (sign-extend i16 lanes to i64) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations.
4386define <4 x i64> @test_pmovsxwq(<8 x i16> %a0, <8 x i16> *%a1) {
4387; GENERIC-LABEL: test_pmovsxwq:
4388; GENERIC:       # %bb.0:
4389; GENERIC-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [1:1.00]
4390; GENERIC-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
4391; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4392; GENERIC-NEXT:    retq # sched: [1:1.00]
4393;
4394; HASWELL-LABEL: test_pmovsxwq:
4395; HASWELL:       # %bb.0:
4396; HASWELL-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
4397; HASWELL-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
4398; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4399; HASWELL-NEXT:    retq # sched: [7:1.00]
4400;
4401; BROADWELL-LABEL: test_pmovsxwq:
4402; BROADWELL:       # %bb.0:
4403; BROADWELL-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
4404; BROADWELL-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
4405; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4406; BROADWELL-NEXT:    retq # sched: [7:1.00]
4407;
4408; SKYLAKE-LABEL: test_pmovsxwq:
4409; SKYLAKE:       # %bb.0:
4410; SKYLAKE-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
4411; SKYLAKE-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
4412; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4413; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4414;
4415; SKX-LABEL: test_pmovsxwq:
4416; SKX:       # %bb.0:
4417; SKX-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
4418; SKX-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
4419; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4420; SKX-NEXT:    retq # sched: [7:1.00]
4421;
4422; ZNVER1-LABEL: test_pmovsxwq:
4423; ZNVER1:       # %bb.0:
4424; ZNVER1-NEXT:    vpmovsxwq (%rdi), %ymm1 # sched: [8:0.50]
4425; ZNVER1-NEXT:    vpmovsxwq %xmm0, %ymm0 # sched: [1:0.50]
4426; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4427; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: keep elements 0-3 of %a0 and sign-extend them to i64.
4428  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4429  %2 = sext <4 x i16> %1 to <4 x i64>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4430  %3 = load <8 x i16>, <8 x i16> *%a1, align 16
4431  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4432  %5 = sext <4 x i16> %4 to <4 x i64>
  ; Both extended vectors feed the returned sum.
4433  %6 = add <4 x i64> %2, %5
4434  ret <4 x i64> %6
4435}
4436
; Exercises vpmovzxbd (zero-extend i8 lanes to i32) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations; the
; `{{.*#+}}` pattern skips the raw operands and matches the decoded lane
; comment that follows them.
4437define <8 x i32> @test_pmovzxbd(<16 x i8> %a0, <16 x i8> *%a1) {
4438; GENERIC-LABEL: test_pmovzxbd:
4439; GENERIC:       # %bb.0:
4440; GENERIC-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
4441; GENERIC-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:1.00]
4442; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4443; GENERIC-NEXT:    retq # sched: [1:1.00]
4444;
4445; HASWELL-LABEL: test_pmovzxbd:
4446; HASWELL:       # %bb.0:
4447; HASWELL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
4448; HASWELL-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
4449; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4450; HASWELL-NEXT:    retq # sched: [7:1.00]
4451;
4452; BROADWELL-LABEL: test_pmovzxbd:
4453; BROADWELL:       # %bb.0:
4454; BROADWELL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
4455; BROADWELL-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [9:1.00]
4456; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4457; BROADWELL-NEXT:    retq # sched: [7:1.00]
4458;
4459; SKYLAKE-LABEL: test_pmovzxbd:
4460; SKYLAKE:       # %bb.0:
4461; SKYLAKE-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
4462; SKYLAKE-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
4463; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4464; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4465;
4466; SKX-LABEL: test_pmovzxbd:
4467; SKX:       # %bb.0:
4468; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
4469; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
4470; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4471; SKX-NEXT:    retq # sched: [7:1.00]
4472;
4473; ZNVER1-LABEL: test_pmovzxbd:
4474; ZNVER1:       # %bb.0:
4475; ZNVER1-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:0.50]
4476; ZNVER1-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:0.50]
4477; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4478; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: keep elements 0-7 of %a0 and zero-extend them to i32.
4479  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4480  %2 = zext <8 x i8> %1 to <8 x i32>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4481  %3 = load <16 x i8>, <16 x i8> *%a1, align 16
4482  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4483  %5 = zext <8 x i8> %4 to <8 x i32>
  ; Both extended vectors feed the returned sum.
4484  %6 = add <8 x i32> %2, %5
4485  ret <8 x i32> %6
4486}
4487
; Exercises vpmovzxbq (zero-extend i8 lanes to i64) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations; the
; `{{.*#+}}` pattern skips the raw operands and matches the decoded lane
; comment that follows them.
4488define <4 x i64> @test_pmovzxbq(<16 x i8> %a0, <16 x i8> *%a1) {
4489; GENERIC-LABEL: test_pmovzxbq:
4490; GENERIC:       # %bb.0:
4491; GENERIC-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
4492; GENERIC-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [8:1.00]
4493; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4494; GENERIC-NEXT:    retq # sched: [1:1.00]
4495;
4496; HASWELL-LABEL: test_pmovzxbq:
4497; HASWELL:       # %bb.0:
4498; HASWELL-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
4499; HASWELL-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
4500; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4501; HASWELL-NEXT:    retq # sched: [7:1.00]
4502;
4503; BROADWELL-LABEL: test_pmovzxbq:
4504; BROADWELL:       # %bb.0:
4505; BROADWELL-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
4506; BROADWELL-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00]
4507; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4508; BROADWELL-NEXT:    retq # sched: [7:1.00]
4509;
4510; SKYLAKE-LABEL: test_pmovzxbq:
4511; SKYLAKE:       # %bb.0:
4512; SKYLAKE-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
4513; SKYLAKE-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
4514; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4515; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4516;
4517; SKX-LABEL: test_pmovzxbq:
4518; SKX:       # %bb.0:
4519; SKX-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
4520; SKX-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
4521; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4522; SKX-NEXT:    retq # sched: [7:1.00]
4523;
4524; ZNVER1-LABEL: test_pmovzxbq:
4525; ZNVER1:       # %bb.0:
4526; ZNVER1-NEXT:    vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50]
4527; ZNVER1-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
4528; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4529; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: keep elements 0-3 of %a0 and zero-extend them to i64.
4530  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4531  %2 = zext <4 x i8> %1 to <4 x i64>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4532  %3 = load <16 x i8>, <16 x i8> *%a1, align 16
4533  %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4534  %5 = zext <4 x i8> %4 to <4 x i64>
  ; Both extended vectors feed the returned sum.
4535  %6 = add <4 x i64> %2, %5
4536  ret <4 x i64> %6
4537}
4538
; Exercises vpmovzxbw (zero-extend i8 lanes to i16) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations; the
; `{{.*#+}}` pattern skips the raw operands and matches the decoded lane
; comment that follows them.
4539define <16 x i16> @test_pmovzxbw(<16 x i8> %a0, <16 x i8> *%a1) {
4540; GENERIC-LABEL: test_pmovzxbw:
4541; GENERIC:       # %bb.0:
4542; GENERIC-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
4543; GENERIC-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:1.00]
4544; GENERIC-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4545; GENERIC-NEXT:    retq # sched: [1:1.00]
4546;
4547; HASWELL-LABEL: test_pmovzxbw:
4548; HASWELL:       # %bb.0:
4549; HASWELL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
4550; HASWELL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
4551; HASWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4552; HASWELL-NEXT:    retq # sched: [7:1.00]
4553;
4554; BROADWELL-LABEL: test_pmovzxbw:
4555; BROADWELL:       # %bb.0:
4556; BROADWELL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
4557; BROADWELL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [9:1.00]
4558; BROADWELL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4559; BROADWELL-NEXT:    retq # sched: [7:1.00]
4560;
4561; SKYLAKE-LABEL: test_pmovzxbw:
4562; SKYLAKE:       # %bb.0:
4563; SKYLAKE-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
4564; SKYLAKE-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
4565; SKYLAKE-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4566; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4567;
4568; SKX-LABEL: test_pmovzxbw:
4569; SKX:       # %bb.0:
4570; SKX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
4571; SKX-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
4572; SKX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4573; SKX-NEXT:    retq # sched: [7:1.00]
4574;
4575; ZNVER1-LABEL: test_pmovzxbw:
4576; ZNVER1:       # %bb.0:
4577; ZNVER1-NEXT:    vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:0.50]
4578; ZNVER1-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:0.50]
4579; ZNVER1-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4580; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: zero-extend all sixteen bytes of %a0 to i16.
4581  %1 = zext <16 x i8> %a0 to <16 x i16>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4582  %2 = load <16 x i8>, <16 x i8> *%a1, align 16
4583  %3 = zext <16 x i8> %2 to <16 x i16>
  ; Both extended vectors feed the returned sum.
4584  %4 = add <16 x i16> %1, %3
4585  ret <16 x i16> %4
4586}
4587
; Exercises vpmovzxdq (zero-extend i32 lanes to i64) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations; the
; `{{.*#+}}` pattern skips the raw operands and matches the decoded lane
; comment that follows them.
4588define <4 x i64> @test_pmovzxdq(<4 x i32> %a0, <4 x i32> *%a1) {
4589; GENERIC-LABEL: test_pmovzxdq:
4590; GENERIC:       # %bb.0:
4591; GENERIC-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
4592; GENERIC-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:1.00]
4593; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4594; GENERIC-NEXT:    retq # sched: [1:1.00]
4595;
4596; HASWELL-LABEL: test_pmovzxdq:
4597; HASWELL:       # %bb.0:
4598; HASWELL-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
4599; HASWELL-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
4600; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4601; HASWELL-NEXT:    retq # sched: [7:1.00]
4602;
4603; BROADWELL-LABEL: test_pmovzxdq:
4604; BROADWELL:       # %bb.0:
4605; BROADWELL-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
4606; BROADWELL-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00]
4607; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4608; BROADWELL-NEXT:    retq # sched: [7:1.00]
4609;
4610; SKYLAKE-LABEL: test_pmovzxdq:
4611; SKYLAKE:       # %bb.0:
4612; SKYLAKE-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
4613; SKYLAKE-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
4614; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4615; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4616;
4617; SKX-LABEL: test_pmovzxdq:
4618; SKX:       # %bb.0:
4619; SKX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
4620; SKX-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
4621; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4622; SKX-NEXT:    retq # sched: [7:1.00]
4623;
4624; ZNVER1-LABEL: test_pmovzxdq:
4625; ZNVER1:       # %bb.0:
4626; ZNVER1-NEXT:    vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50]
4627; ZNVER1-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
4628; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4629; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: zero-extend all four i32 elements of %a0 to i64.
4630  %1 = zext <4 x i32> %a0 to <4 x i64>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4631  %2 = load <4 x i32>, <4 x i32> *%a1, align 16
4632  %3 = zext <4 x i32> %2 to <4 x i64>
  ; Both extended vectors feed the returned sum.
4633  %4 = add <4 x i64> %1, %3
4634  ret <4 x i64> %4
4635}
4636
; Exercises vpmovzxwd (zero-extend i16 lanes to i32) with a register source and
; with a memory source. The assertion groups below pin, per tested CPU model,
; the emitted instructions together with their `sched:` annotations; the
; `{{.*#+}}` pattern skips the raw operands and matches the decoded lane
; comment that follows them.
4637define <8 x i32> @test_pmovzxwd(<8 x i16> %a0, <8 x i16> *%a1) {
4638; GENERIC-LABEL: test_pmovzxwd:
4639; GENERIC:       # %bb.0:
4640; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
4641; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
4642; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4643; GENERIC-NEXT:    retq # sched: [1:1.00]
4644;
4645; HASWELL-LABEL: test_pmovzxwd:
4646; HASWELL:       # %bb.0:
4647; HASWELL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
4648; HASWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
4649; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4650; HASWELL-NEXT:    retq # sched: [7:1.00]
4651;
4652; BROADWELL-LABEL: test_pmovzxwd:
4653; BROADWELL:       # %bb.0:
4654; BROADWELL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
4655; BROADWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
4656; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4657; BROADWELL-NEXT:    retq # sched: [7:1.00]
4658;
4659; SKYLAKE-LABEL: test_pmovzxwd:
4660; SKYLAKE:       # %bb.0:
4661; SKYLAKE-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
4662; SKYLAKE-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
4663; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4664; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4665;
4666; SKX-LABEL: test_pmovzxwd:
4667; SKX:       # %bb.0:
4668; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
4669; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
4670; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4671; SKX-NEXT:    retq # sched: [7:1.00]
4672;
4673; ZNVER1-LABEL: test_pmovzxwd:
4674; ZNVER1:       # %bb.0:
4675; ZNVER1-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50]
4676; ZNVER1-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
4677; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4678; ZNVER1-NEXT:    retq # sched: [1:0.50]
  ; Register source: zero-extend all eight i16 elements of %a0 to i32.
4679  %1 = zext <8 x i16> %a0 to <8 x i32>
  ; Memory source: the same extension applied to the vector loaded from %a1.
4680  %2 = load <8 x i16>, <8 x i16> *%a1, align 16
4681  %3 = zext <8 x i16> %2 to <8 x i32>
  ; Both extended vectors feed the returned sum.
4682  %4 = add <8 x i32> %1, %3
4683  ret <8 x i32> %4
4684}
4685
; test_pmovzxwq: zero-extends the low four i16 lanes of %a0, and of a vector
; loaded from %a1, to <4 x i64> and adds the two results. The checks expect a
; register-source and a memory-source vpmovzxwq followed by vpaddq, each with a
; "sched:" annotation, once per CPU check prefix.
4686define <4 x i64> @test_pmovzxwq(<8 x i16> %a0, <8 x i16> *%a1) {
4687; GENERIC-LABEL: test_pmovzxwq:
4688; GENERIC:       # %bb.0:
4689; GENERIC-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
4690; GENERIC-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:1.00]
4691; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4692; GENERIC-NEXT:    retq # sched: [1:1.00]
4693;
4694; HASWELL-LABEL: test_pmovzxwq:
4695; HASWELL:       # %bb.0:
4696; HASWELL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
4697; HASWELL-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
4698; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4699; HASWELL-NEXT:    retq # sched: [7:1.00]
4700;
4701; BROADWELL-LABEL: test_pmovzxwq:
4702; BROADWELL:       # %bb.0:
4703; BROADWELL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
4704; BROADWELL-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00]
4705; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
4706; BROADWELL-NEXT:    retq # sched: [7:1.00]
4707;
4708; SKYLAKE-LABEL: test_pmovzxwq:
4709; SKYLAKE:       # %bb.0:
4710; SKYLAKE-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
4711; SKYLAKE-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
4712; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4713; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4714;
4715; SKX-LABEL: test_pmovzxwq:
4716; SKX:       # %bb.0:
4717; SKX-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
4718; SKX-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
4719; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
4720; SKX-NEXT:    retq # sched: [7:1.00]
4721;
4722; ZNVER1-LABEL: test_pmovzxwq:
4723; ZNVER1:       # %bb.0:
4724; ZNVER1-NEXT:    vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50]
4725; ZNVER1-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
4726; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
4727; ZNVER1-NEXT:    retq # sched: [1:0.50]
4728  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4729  %2 = zext <4 x i16> %1 to <4 x i64>
4730  %3 = load <8 x i16>, <8 x i16> *%a1, align 16
4731  %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4732  %5 = zext <4 x i16> %4 to <4 x i64>
4733  %6 = add <4 x i64> %2, %5
4734  ret <4 x i64> %6
4735}
4736
; test_pmuldq: calls @llvm.x86.avx2.pmul.dq on (%a0, %a1), bitcasts the result
; back to <8 x i32>, and calls the intrinsic again with a vector loaded from
; %a2. The checks expect a register-register vpmuldq followed by a vpmuldq with
; a (%rdi) memory operand, each with a "sched:" annotation per CPU prefix.
4737define <4 x i64> @test_pmuldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
4738; GENERIC-LABEL: test_pmuldq:
4739; GENERIC:       # %bb.0:
4740; GENERIC-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4741; GENERIC-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4742; GENERIC-NEXT:    retq # sched: [1:1.00]
4743;
4744; HASWELL-LABEL: test_pmuldq:
4745; HASWELL:       # %bb.0:
4746; HASWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4747; HASWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4748; HASWELL-NEXT:    retq # sched: [7:1.00]
4749;
4750; BROADWELL-LABEL: test_pmuldq:
4751; BROADWELL:       # %bb.0:
4752; BROADWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4753; BROADWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4754; BROADWELL-NEXT:    retq # sched: [7:1.00]
4755;
4756; SKYLAKE-LABEL: test_pmuldq:
4757; SKYLAKE:       # %bb.0:
4758; SKYLAKE-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4759; SKYLAKE-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4760; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4761;
4762; SKX-LABEL: test_pmuldq:
4763; SKX:       # %bb.0:
4764; SKX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4765; SKX-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4766; SKX-NEXT:    retq # sched: [7:1.00]
4767;
4768; ZNVER1-LABEL: test_pmuldq:
4769; ZNVER1:       # %bb.0:
4770; ZNVER1-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
4771; ZNVER1-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4772; ZNVER1-NEXT:    retq # sched: [1:0.50]
4773  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1)
4774  %2 = bitcast <4 x i64> %1 to <8 x i32>
4775  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
4776  %4 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %2, <8 x i32> %3)
4777  ret <4 x i64> %4
4778}
4779declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
4780
; test_pmulhrsw: calls @llvm.x86.avx2.pmul.hr.sw on (%a0, %a1), then again on
; that result and a vector loaded from %a2. The checks expect a
; register-register vpmulhrsw followed by a vpmulhrsw with a (%rdi) memory
; operand, each with a "sched:" annotation per CPU prefix.
4781define <16 x i16> @test_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
4782; GENERIC-LABEL: test_pmulhrsw:
4783; GENERIC:       # %bb.0:
4784; GENERIC-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4785; GENERIC-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4786; GENERIC-NEXT:    retq # sched: [1:1.00]
4787;
4788; HASWELL-LABEL: test_pmulhrsw:
4789; HASWELL:       # %bb.0:
4790; HASWELL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4791; HASWELL-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4792; HASWELL-NEXT:    retq # sched: [7:1.00]
4793;
4794; BROADWELL-LABEL: test_pmulhrsw:
4795; BROADWELL:       # %bb.0:
4796; BROADWELL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4797; BROADWELL-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4798; BROADWELL-NEXT:    retq # sched: [7:1.00]
4799;
4800; SKYLAKE-LABEL: test_pmulhrsw:
4801; SKYLAKE:       # %bb.0:
4802; SKYLAKE-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4803; SKYLAKE-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4804; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4805;
4806; SKX-LABEL: test_pmulhrsw:
4807; SKX:       # %bb.0:
4808; SKX-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4809; SKX-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4810; SKX-NEXT:    retq # sched: [7:1.00]
4811;
4812; ZNVER1-LABEL: test_pmulhrsw:
4813; ZNVER1:       # %bb.0:
4814; ZNVER1-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
4815; ZNVER1-NEXT:    vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4816; ZNVER1-NEXT:    retq # sched: [1:0.50]
4817  %1 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
4818  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
4819  %3 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %1, <16 x i16> %2)
4820  ret <16 x i16> %3
4821}
4822declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
4823
; test_pmulhuw: calls @llvm.x86.avx2.pmulhu.w on (%a0, %a1), then again on that
; result and a vector loaded from %a2. The checks expect a register-register
; vpmulhuw followed by a vpmulhuw with a (%rdi) memory operand, each with a
; "sched:" annotation per CPU prefix.
4824define <16 x i16> @test_pmulhuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
4825; GENERIC-LABEL: test_pmulhuw:
4826; GENERIC:       # %bb.0:
4827; GENERIC-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4828; GENERIC-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4829; GENERIC-NEXT:    retq # sched: [1:1.00]
4830;
4831; HASWELL-LABEL: test_pmulhuw:
4832; HASWELL:       # %bb.0:
4833; HASWELL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4834; HASWELL-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4835; HASWELL-NEXT:    retq # sched: [7:1.00]
4836;
4837; BROADWELL-LABEL: test_pmulhuw:
4838; BROADWELL:       # %bb.0:
4839; BROADWELL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4840; BROADWELL-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4841; BROADWELL-NEXT:    retq # sched: [7:1.00]
4842;
4843; SKYLAKE-LABEL: test_pmulhuw:
4844; SKYLAKE:       # %bb.0:
4845; SKYLAKE-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4846; SKYLAKE-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4847; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4848;
4849; SKX-LABEL: test_pmulhuw:
4850; SKX:       # %bb.0:
4851; SKX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4852; SKX-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4853; SKX-NEXT:    retq # sched: [7:1.00]
4854;
4855; ZNVER1-LABEL: test_pmulhuw:
4856; ZNVER1:       # %bb.0:
4857; ZNVER1-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
4858; ZNVER1-NEXT:    vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4859; ZNVER1-NEXT:    retq # sched: [1:0.50]
4860  %1 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
4861  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
4862  %3 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %1, <16 x i16> %2)
4863  ret <16 x i16> %3
4864}
4865declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
4866
; test_pmulhw: calls @llvm.x86.avx2.pmulh.w on (%a0, %a1), then again on that
; result and a vector loaded from %a2. The checks expect a register-register
; vpmulhw followed by a vpmulhw with a (%rdi) memory operand, each with a
; "sched:" annotation per CPU prefix.
4867define <16 x i16> @test_pmulhw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
4868; GENERIC-LABEL: test_pmulhw:
4869; GENERIC:       # %bb.0:
4870; GENERIC-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4871; GENERIC-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4872; GENERIC-NEXT:    retq # sched: [1:1.00]
4873;
4874; HASWELL-LABEL: test_pmulhw:
4875; HASWELL:       # %bb.0:
4876; HASWELL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4877; HASWELL-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4878; HASWELL-NEXT:    retq # sched: [7:1.00]
4879;
4880; BROADWELL-LABEL: test_pmulhw:
4881; BROADWELL:       # %bb.0:
4882; BROADWELL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4883; BROADWELL-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4884; BROADWELL-NEXT:    retq # sched: [7:1.00]
4885;
4886; SKYLAKE-LABEL: test_pmulhw:
4887; SKYLAKE:       # %bb.0:
4888; SKYLAKE-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4889; SKYLAKE-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4890; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4891;
4892; SKX-LABEL: test_pmulhw:
4893; SKX:       # %bb.0:
4894; SKX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4895; SKX-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4896; SKX-NEXT:    retq # sched: [7:1.00]
4897;
4898; ZNVER1-LABEL: test_pmulhw:
4899; ZNVER1:       # %bb.0:
4900; ZNVER1-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
4901; ZNVER1-NEXT:    vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4902; ZNVER1-NEXT:    retq # sched: [1:0.50]
4903  %1 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
4904  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
4905  %3 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %1, <16 x i16> %2)
4906  ret <16 x i16> %3
4907}
4908declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
4909
; test_pmulld: multiplies %a0 by %a1 with a plain <8 x i32> `mul`, then
; multiplies that product by a vector loaded from %a2. The checks expect a
; register-register vpmulld followed by a vpmulld with a (%rdi) memory operand,
; each with a "sched:" annotation per CPU prefix.
4910define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
4911; GENERIC-LABEL: test_pmulld:
4912; GENERIC:       # %bb.0:
4913; GENERIC-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4914; GENERIC-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4915; GENERIC-NEXT:    retq # sched: [1:1.00]
4916;
4917; HASWELL-LABEL: test_pmulld:
4918; HASWELL:       # %bb.0:
4919; HASWELL-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
4920; HASWELL-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [17:2.00]
4921; HASWELL-NEXT:    retq # sched: [7:1.00]
4922;
4923; BROADWELL-LABEL: test_pmulld:
4924; BROADWELL:       # %bb.0:
4925; BROADWELL-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
4926; BROADWELL-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
4927; BROADWELL-NEXT:    retq # sched: [7:1.00]
4928;
4929; SKYLAKE-LABEL: test_pmulld:
4930; SKYLAKE:       # %bb.0:
4931; SKYLAKE-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
4932; SKYLAKE-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [17:1.00]
4933; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4934;
4935; SKX-LABEL: test_pmulld:
4936; SKX:       # %bb.0:
4937; SKX-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
4938; SKX-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [17:1.00]
4939; SKX-NEXT:    retq # sched: [7:1.00]
4940;
4941; ZNVER1-LABEL: test_pmulld:
4942; ZNVER1:       # %bb.0:
4943; ZNVER1-NEXT:    vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
4944; ZNVER1-NEXT:    vpmulld (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
4945; ZNVER1-NEXT:    retq # sched: [1:0.50]
4946  %1 = mul <8 x i32> %a0, %a1
4947  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
4948  %3 = mul <8 x i32> %1, %2
4949  ret <8 x i32> %3
4950}
4951
; test_pmullw: multiplies %a0 by %a1 with a plain <16 x i16> `mul`, then
; multiplies that product by a vector loaded from %a2. The checks expect a
; register-register vpmullw followed by a vpmullw with a (%rdi) memory operand,
; each with a "sched:" annotation per CPU prefix.
4952define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
4953; GENERIC-LABEL: test_pmullw:
4954; GENERIC:       # %bb.0:
4955; GENERIC-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4956; GENERIC-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4957; GENERIC-NEXT:    retq # sched: [1:1.00]
4958;
4959; HASWELL-LABEL: test_pmullw:
4960; HASWELL:       # %bb.0:
4961; HASWELL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4962; HASWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4963; HASWELL-NEXT:    retq # sched: [7:1.00]
4964;
4965; BROADWELL-LABEL: test_pmullw:
4966; BROADWELL:       # %bb.0:
4967; BROADWELL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4968; BROADWELL-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4969; BROADWELL-NEXT:    retq # sched: [7:1.00]
4970;
4971; SKYLAKE-LABEL: test_pmullw:
4972; SKYLAKE:       # %bb.0:
4973; SKYLAKE-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4974; SKYLAKE-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4975; SKYLAKE-NEXT:    retq # sched: [7:1.00]
4976;
4977; SKX-LABEL: test_pmullw:
4978; SKX:       # %bb.0:
4979; SKX-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
4980; SKX-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
4981; SKX-NEXT:    retq # sched: [7:1.00]
4982;
4983; ZNVER1-LABEL: test_pmullw:
4984; ZNVER1:       # %bb.0:
4985; ZNVER1-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
4986; ZNVER1-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
4987; ZNVER1-NEXT:    retq # sched: [1:0.50]
4988  %1 = mul <16 x i16> %a0, %a1
4989  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
4990  %3 = mul <16 x i16> %1, %2
4991  ret <16 x i16> %3
4992}
4993
; test_pmuludq: calls @llvm.x86.avx2.pmulu.dq on (%a0, %a1), bitcasts the
; result back to <8 x i32>, and calls the intrinsic again with a vector loaded
; from %a2. The checks expect a register-register vpmuludq followed by a
; vpmuludq with a (%rdi) memory operand, each with a "sched:" annotation per
; CPU prefix.
4994define <4 x i64> @test_pmuludq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
4995; GENERIC-LABEL: test_pmuludq:
4996; GENERIC:       # %bb.0:
4997; GENERIC-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
4998; GENERIC-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
4999; GENERIC-NEXT:    retq # sched: [1:1.00]
5000;
5001; HASWELL-LABEL: test_pmuludq:
5002; HASWELL:       # %bb.0:
5003; HASWELL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
5004; HASWELL-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
5005; HASWELL-NEXT:    retq # sched: [7:1.00]
5006;
5007; BROADWELL-LABEL: test_pmuludq:
5008; BROADWELL:       # %bb.0:
5009; BROADWELL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
5010; BROADWELL-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5011; BROADWELL-NEXT:    retq # sched: [7:1.00]
5012;
5013; SKYLAKE-LABEL: test_pmuludq:
5014; SKYLAKE:       # %bb.0:
5015; SKYLAKE-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
5016; SKYLAKE-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
5017; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5018;
5019; SKX-LABEL: test_pmuludq:
5020; SKX:       # %bb.0:
5021; SKX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
5022; SKX-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
5023; SKX-NEXT:    retq # sched: [7:1.00]
5024;
5025; ZNVER1-LABEL: test_pmuludq:
5026; ZNVER1:       # %bb.0:
5027; ZNVER1-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
5028; ZNVER1-NEXT:    vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5029; ZNVER1-NEXT:    retq # sched: [1:0.50]
5030  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1)
5031  %2 = bitcast <4 x i64> %1 to <8 x i32>
5032  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
5033  %4 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %2, <8 x i32> %3)
5034  ret <4 x i64> %4
5035}
5036declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
5037
; test_por: ORs %a0 with %a1, ORs that with a vector loaded from %a2, and then
; adds %a1 to the result. The checks expect a register-register vpor, a vpor
; with a (%rdi) memory operand, and a trailing vpaddq, each with a "sched:"
; annotation per CPU prefix.
5038define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
5039; GENERIC-LABEL: test_por:
5040; GENERIC:       # %bb.0:
5041; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5042; GENERIC-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5043; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5044; GENERIC-NEXT:    retq # sched: [1:1.00]
5045;
5046; HASWELL-LABEL: test_por:
5047; HASWELL:       # %bb.0:
5048; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5049; HASWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5050; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5051; HASWELL-NEXT:    retq # sched: [7:1.00]
5052;
5053; BROADWELL-LABEL: test_por:
5054; BROADWELL:       # %bb.0:
5055; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5056; BROADWELL-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
5057; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5058; BROADWELL-NEXT:    retq # sched: [7:1.00]
5059;
5060; SKYLAKE-LABEL: test_por:
5061; SKYLAKE:       # %bb.0:
5062; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5063; SKYLAKE-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5064; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5065; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5066;
5067; SKX-LABEL: test_por:
5068; SKX:       # %bb.0:
5069; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5070; SKX-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5071; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5072; SKX-NEXT:    retq # sched: [7:1.00]
5073;
5074; ZNVER1-LABEL: test_por:
5075; ZNVER1:       # %bb.0:
5076; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5077; ZNVER1-NEXT:    vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5078; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5079; ZNVER1-NEXT:    retq # sched: [1:0.50]
5080  %1 = or <4 x i64> %a0, %a1
5081  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
5082  %3 = or <4 x i64> %1, %2
5083  %4 = add <4 x i64> %3, %a1
5084  ret <4 x i64> %4
5085}
5086
; test_psadbw: calls @llvm.x86.avx2.psad.bw on (%a0, %a1), bitcasts the result
; back to <32 x i8>, and calls the intrinsic again with a vector loaded from
; %a2. The checks expect a register-register vpsadbw followed by a vpsadbw with
; a (%rdi) memory operand, each with a "sched:" annotation per CPU prefix.
5087define <4 x i64> @test_psadbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
5088; GENERIC-LABEL: test_psadbw:
5089; GENERIC:       # %bb.0:
5090; GENERIC-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
5091; GENERIC-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
5092; GENERIC-NEXT:    retq # sched: [1:1.00]
5093;
5094; HASWELL-LABEL: test_psadbw:
5095; HASWELL:       # %bb.0:
5096; HASWELL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
5097; HASWELL-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
5098; HASWELL-NEXT:    retq # sched: [7:1.00]
5099;
5100; BROADWELL-LABEL: test_psadbw:
5101; BROADWELL:       # %bb.0:
5102; BROADWELL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
5103; BROADWELL-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5104; BROADWELL-NEXT:    retq # sched: [7:1.00]
5105;
5106; SKYLAKE-LABEL: test_psadbw:
5107; SKYLAKE:       # %bb.0:
5108; SKYLAKE-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
5109; SKYLAKE-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
5110; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5111;
5112; SKX-LABEL: test_psadbw:
5113; SKX:       # %bb.0:
5114; SKX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
5115; SKX-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
5116; SKX-NEXT:    retq # sched: [7:1.00]
5117;
5118; ZNVER1-LABEL: test_psadbw:
5119; ZNVER1:       # %bb.0:
5120; ZNVER1-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
5121; ZNVER1-NEXT:    vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
5122; ZNVER1-NEXT:    retq # sched: [1:0.50]
5123  %1 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
5124  %2 = bitcast <4 x i64> %1 to <32 x i8>
5125  %3 = load <32 x i8>, <32 x i8> *%a2, align 32
5126  %4 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %2, <32 x i8> %3)
5127  ret <4 x i64> %4
5128}
5129declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
5130
; test_pshufb: calls @llvm.x86.avx2.pshuf.b on (%a0, %a1), then again on that
; result and a vector loaded from %a2. The checks expect a register-register
; vpshufb followed by a vpshufb with a (%rdi) memory operand, each with a
; "sched:" annotation per CPU prefix.
5131define <32 x i8> @test_pshufb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
5132; GENERIC-LABEL: test_pshufb:
5133; GENERIC:       # %bb.0:
5134; GENERIC-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5135; GENERIC-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5136; GENERIC-NEXT:    retq # sched: [1:1.00]
5137;
5138; HASWELL-LABEL: test_pshufb:
5139; HASWELL:       # %bb.0:
5140; HASWELL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5141; HASWELL-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5142; HASWELL-NEXT:    retq # sched: [7:1.00]
5143;
5144; BROADWELL-LABEL: test_pshufb:
5145; BROADWELL:       # %bb.0:
5146; BROADWELL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5147; BROADWELL-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
5148; BROADWELL-NEXT:    retq # sched: [7:1.00]
5149;
5150; SKYLAKE-LABEL: test_pshufb:
5151; SKYLAKE:       # %bb.0:
5152; SKYLAKE-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5153; SKYLAKE-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5154; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5155;
5156; SKX-LABEL: test_pshufb:
5157; SKX:       # %bb.0:
5158; SKX-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5159; SKX-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5160; SKX-NEXT:    retq # sched: [7:1.00]
5161;
5162; ZNVER1-LABEL: test_pshufb:
5163; ZNVER1:       # %bb.0:
5164; ZNVER1-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5165; ZNVER1-NEXT:    vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5166; ZNVER1-NEXT:    retq # sched: [1:0.50]
5167  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
5168  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
5169  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> %2)
5170  ret <32 x i8> %3
5171}
5172declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
5173
; test_pshufd: shuffles %a0 with mask <3,2,1,0,7,6,5,4>, shuffles a vector
; loaded from %a1 with mask <1,0,3,2,5,4,7,6>, and adds the two results. The
; checks expect a register-source and a memory-source vpshufd followed by
; vpaddd, each with a "sched:" annotation per CPU prefix; the ZNVER1 checks
; list the memory-source shuffle first.
5174define <8 x i32> @test_pshufd(<8 x i32> %a0, <8 x i32> *%a1) {
5175; GENERIC-LABEL: test_pshufd:
5176; GENERIC:       # %bb.0:
5177; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
5178; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
5179; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5180; GENERIC-NEXT:    retq # sched: [1:1.00]
5181;
5182; HASWELL-LABEL: test_pshufd:
5183; HASWELL:       # %bb.0:
5184; HASWELL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
5185; HASWELL-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
5186; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5187; HASWELL-NEXT:    retq # sched: [7:1.00]
5188;
5189; BROADWELL-LABEL: test_pshufd:
5190; BROADWELL:       # %bb.0:
5191; BROADWELL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
5192; BROADWELL-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [7:1.00]
5193; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5194; BROADWELL-NEXT:    retq # sched: [7:1.00]
5195;
5196; SKYLAKE-LABEL: test_pshufd:
5197; SKYLAKE:       # %bb.0:
5198; SKYLAKE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
5199; SKYLAKE-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
5200; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5201; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5202;
5203; SKX-LABEL: test_pshufd:
5204; SKX:       # %bb.0:
5205; SKX-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
5206; SKX-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
5207; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5208; SKX-NEXT:    retq # sched: [7:1.00]
5209;
5210; ZNVER1-LABEL: test_pshufd:
5211; ZNVER1:       # %bb.0:
5212; ZNVER1-NEXT:    vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:0.50]
5213; ZNVER1-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.25]
5214; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5215; ZNVER1-NEXT:    retq # sched: [1:0.50]
5216  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
5217  %2 = load <8 x i32>, <8 x i32> *%a1, align 32
5218  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
5219  %4 = add <8 x i32> %1, %3
5220  ret <8 x i32> %4
5221}
5222
; test_pshufhw: shuffles %a0 so that lanes 4-7 and 12-15 are reversed while the
; other lanes stay in place, shuffles a vector loaded from %a1 so that adjacent
; lanes in 4-7 and 12-15 are swapped pairwise, and ORs the two results. The
; checks expect a register-source and a memory-source vpshufhw followed by
; vpor, each with a "sched:" annotation per CPU prefix; the ZNVER1 checks list
; the memory-source shuffle first.
5223define <16 x i16> @test_pshufhw(<16 x i16> %a0, <16 x i16> *%a1) {
5224; GENERIC-LABEL: test_pshufhw:
5225; GENERIC:       # %bb.0:
5226; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
5227; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
5228; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5229; GENERIC-NEXT:    retq # sched: [1:1.00]
5230;
5231; HASWELL-LABEL: test_pshufhw:
5232; HASWELL:       # %bb.0:
5233; HASWELL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
5234; HASWELL-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
5235; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5236; HASWELL-NEXT:    retq # sched: [7:1.00]
5237;
5238; BROADWELL-LABEL: test_pshufhw:
5239; BROADWELL:       # %bb.0:
5240; BROADWELL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
5241; BROADWELL-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [7:1.00]
5242; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5243; BROADWELL-NEXT:    retq # sched: [7:1.00]
5244;
5245; SKYLAKE-LABEL: test_pshufhw:
5246; SKYLAKE:       # %bb.0:
5247; SKYLAKE-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
5248; SKYLAKE-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
5249; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5250; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5251;
5252; SKX-LABEL: test_pshufhw:
5253; SKX:       # %bb.0:
5254; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
5255; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
5256; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5257; SKX-NEXT:    retq # sched: [7:1.00]
5258;
5259; ZNVER1-LABEL: test_pshufhw:
5260; ZNVER1:       # %bb.0:
5261; ZNVER1-NEXT:    vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:0.50]
5262; ZNVER1-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:0.25]
5263; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5264; ZNVER1-NEXT:    retq # sched: [1:0.50]
5265  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
5266  %2 = load <16 x i16>, <16 x i16> *%a1, align 32
5267  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14>
5268  %4 = or <16 x i16> %1, %3
5269  ret <16 x i16> %4
5270}
5271
; test_pshuflw: shuffles %a0 so that lanes 0-3 and 8-11 are reversed while the
; other lanes stay in place, shuffles a vector loaded from %a1 so that adjacent
; lanes in 0-3 and 8-11 are swapped pairwise, and ORs the two results. The
; checks expect a register-source and a memory-source vpshuflw followed by
; vpor, each with a "sched:" annotation per CPU prefix; the ZNVER1 checks list
; the memory-source shuffle first.
5272define <16 x i16> @test_pshuflw(<16 x i16> %a0, <16 x i16> *%a1) {
5273; GENERIC-LABEL: test_pshuflw:
5274; GENERIC:       # %bb.0:
5275; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
5276; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
5277; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5278; GENERIC-NEXT:    retq # sched: [1:1.00]
5279;
5280; HASWELL-LABEL: test_pshuflw:
5281; HASWELL:       # %bb.0:
5282; HASWELL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
5283; HASWELL-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
5284; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5285; HASWELL-NEXT:    retq # sched: [7:1.00]
5286;
5287; BROADWELL-LABEL: test_pshuflw:
5288; BROADWELL:       # %bb.0:
5289; BROADWELL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
5290; BROADWELL-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [7:1.00]
5291; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5292; BROADWELL-NEXT:    retq # sched: [7:1.00]
5293;
5294; SKYLAKE-LABEL: test_pshuflw:
5295; SKYLAKE:       # %bb.0:
5296; SKYLAKE-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
5297; SKYLAKE-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
5298; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5299; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5300;
5301; SKX-LABEL: test_pshuflw:
5302; SKX:       # %bb.0:
5303; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
5304; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
5305; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
5306; SKX-NEXT:    retq # sched: [7:1.00]
5307;
5308; ZNVER1-LABEL: test_pshuflw:
5309; ZNVER1:       # %bb.0:
5310; ZNVER1-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:0.50]
5311; ZNVER1-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:0.25]
5312; ZNVER1-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5313; ZNVER1-NEXT:    retq # sched: [1:0.50]
5314  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
5315  %2 = load <16 x i16>, <16 x i16> *%a1, align 32
5316  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
5317  %4 = or <16 x i16> %1, %3
5318  ret <16 x i16> %4
5319}
5320
; test_psignb: calls @llvm.x86.avx2.psign.b on (%a0, %a1), then again on that
; result and a vector loaded from %a2. The checks expect a register-register
; vpsignb followed by a vpsignb with a (%rdi) memory operand, each with a
; "sched:" annotation per CPU prefix.
5321define <32 x i8> @test_psignb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
5322; GENERIC-LABEL: test_psignb:
5323; GENERIC:       # %bb.0:
5324; GENERIC-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5325; GENERIC-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5326; GENERIC-NEXT:    retq # sched: [1:1.00]
5327;
5328; HASWELL-LABEL: test_psignb:
5329; HASWELL:       # %bb.0:
5330; HASWELL-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5331; HASWELL-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5332; HASWELL-NEXT:    retq # sched: [7:1.00]
5333;
5334; BROADWELL-LABEL: test_psignb:
5335; BROADWELL:       # %bb.0:
5336; BROADWELL-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5337; BROADWELL-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
5338; BROADWELL-NEXT:    retq # sched: [7:1.00]
5339;
5340; SKYLAKE-LABEL: test_psignb:
5341; SKYLAKE:       # %bb.0:
5342; SKYLAKE-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5343; SKYLAKE-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5344; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5345;
5346; SKX-LABEL: test_psignb:
5347; SKX:       # %bb.0:
5348; SKX-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5349; SKX-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5350; SKX-NEXT:    retq # sched: [7:1.00]
5351;
5352; ZNVER1-LABEL: test_psignb:
5353; ZNVER1:       # %bb.0:
5354; ZNVER1-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5355; ZNVER1-NEXT:    vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5356; ZNVER1-NEXT:    retq # sched: [1:0.50]
5357  %1 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
5358  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
5359  %3 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %1, <32 x i8> %2)
5360  ret <32 x i8> %3
5361}
5362declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
5363
; test_psignd: calls the AVX2 psign.d intrinsic twice on <8 x i32> vectors so
; that the expected assembly contains both operand forms of vpsignd: first with
; two register operands (%a0, %a1), then with the previous result and a vector
; loaded from %a2 (the `(%rdi)` memory operand in the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5364define <8 x i32> @test_psignd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
5365; GENERIC-LABEL: test_psignd:
5366; GENERIC:       # %bb.0:
5367; GENERIC-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5368; GENERIC-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5369; GENERIC-NEXT:    retq # sched: [1:1.00]
5370;
5371; HASWELL-LABEL: test_psignd:
5372; HASWELL:       # %bb.0:
5373; HASWELL-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5374; HASWELL-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5375; HASWELL-NEXT:    retq # sched: [7:1.00]
5376;
5377; BROADWELL-LABEL: test_psignd:
5378; BROADWELL:       # %bb.0:
5379; BROADWELL-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5380; BROADWELL-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
5381; BROADWELL-NEXT:    retq # sched: [7:1.00]
5382;
5383; SKYLAKE-LABEL: test_psignd:
5384; SKYLAKE:       # %bb.0:
5385; SKYLAKE-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5386; SKYLAKE-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5387; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5388;
5389; SKX-LABEL: test_psignd:
5390; SKX:       # %bb.0:
5391; SKX-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5392; SKX-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5393; SKX-NEXT:    retq # sched: [7:1.00]
5394;
5395; ZNVER1-LABEL: test_psignd:
5396; ZNVER1:       # %bb.0:
5397; ZNVER1-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5398; ZNVER1-NEXT:    vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5399; ZNVER1-NEXT:    retq # sched: [1:0.50]
5400  %1 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
5401  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
5402  %3 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %1, <8 x i32> %2)
5403  ret <8 x i32> %3
5404}
5405declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
5406
; test_psignw: calls the AVX2 psign.w intrinsic twice on <16 x i16> vectors so
; that the expected assembly contains both operand forms of vpsignw: first with
; two register operands (%a0, %a1), then with the previous result and a vector
; loaded from %a2 (the `(%rdi)` memory operand in the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5407define <16 x i16> @test_psignw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
5408; GENERIC-LABEL: test_psignw:
5409; GENERIC:       # %bb.0:
5410; GENERIC-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5411; GENERIC-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5412; GENERIC-NEXT:    retq # sched: [1:1.00]
5413;
5414; HASWELL-LABEL: test_psignw:
5415; HASWELL:       # %bb.0:
5416; HASWELL-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5417; HASWELL-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5418; HASWELL-NEXT:    retq # sched: [7:1.00]
5419;
5420; BROADWELL-LABEL: test_psignw:
5421; BROADWELL:       # %bb.0:
5422; BROADWELL-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5423; BROADWELL-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
5424; BROADWELL-NEXT:    retq # sched: [7:1.00]
5425;
5426; SKYLAKE-LABEL: test_psignw:
5427; SKYLAKE:       # %bb.0:
5428; SKYLAKE-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5429; SKYLAKE-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5430; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5431;
5432; SKX-LABEL: test_psignw:
5433; SKX:       # %bb.0:
5434; SKX-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5435; SKX-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5436; SKX-NEXT:    retq # sched: [7:1.00]
5437;
5438; ZNVER1-LABEL: test_psignw:
5439; ZNVER1:       # %bb.0:
5440; ZNVER1-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
5441; ZNVER1-NEXT:    vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5442; ZNVER1-NEXT:    retq # sched: [1:0.50]
5443  %1 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
5444  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
5445  %3 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %1, <16 x i16> %2)
5446  ret <16 x i16> %3
5447}
5448declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
5449
; test_pslld: produces three operand forms of vpslld on an <8 x i32> value.
;   1. psll.d intrinsic with the shift count in a register (%a1, an xmm operand
;      in the expected output).
;   2. psll.d intrinsic with the shift count loaded from %a2 (the `(%rdi)`
;      memory operand in the expected output).
;   3. a plain IR `shl` by a splat constant 2, which the expected output shows
;      as the immediate form `vpslld $2`.
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5450define <8 x i32> @test_pslld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
5451; GENERIC-LABEL: test_pslld:
5452; GENERIC:       # %bb.0:
5453; GENERIC-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5454; GENERIC-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5455; GENERIC-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
5456; GENERIC-NEXT:    retq # sched: [1:1.00]
5457;
5458; HASWELL-LABEL: test_pslld:
5459; HASWELL:       # %bb.0:
5460; HASWELL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5461; HASWELL-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5462; HASWELL-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
5463; HASWELL-NEXT:    retq # sched: [7:1.00]
5464;
5465; BROADWELL-LABEL: test_pslld:
5466; BROADWELL:       # %bb.0:
5467; BROADWELL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5468; BROADWELL-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
5469; BROADWELL-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
5470; BROADWELL-NEXT:    retq # sched: [7:1.00]
5471;
5472; SKYLAKE-LABEL: test_pslld:
5473; SKYLAKE:       # %bb.0:
5474; SKYLAKE-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5475; SKYLAKE-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5476; SKYLAKE-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:0.50]
5477; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5478;
5479; SKX-LABEL: test_pslld:
5480; SKX:       # %bb.0:
5481; SKX-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5482; SKX-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5483; SKX-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:0.50]
5484; SKX-NEXT:    retq # sched: [7:1.00]
5485;
5486; ZNVER1-LABEL: test_pslld:
5487; ZNVER1:       # %bb.0:
5488; ZNVER1-NEXT:    vpslld %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
5489; ZNVER1-NEXT:    vpslld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
5490; ZNVER1-NEXT:    vpslld $2, %ymm0, %ymm0 # sched: [1:0.25]
5491; ZNVER1-NEXT:    retq # sched: [1:0.50]
5492  %1 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
5493  %2 = load <4 x i32>, <4 x i32> *%a2, align 16
5494  %3 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %1, <4 x i32> %2)
5495  %4 = shl <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
5496  ret <8 x i32> %4
5497}
5498declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
5499
; test_pslldq: a single shufflevector of a zero vector with %a0 whose mask, per
; the expected output, is selected as one vpslldq: within each 16-byte half of
; the result the three lowest bytes are zero and the remaining thirteen come
; from the low thirteen bytes of the matching half of %a0. There is no memory
; operand in this test; only the register form is checked.
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5500define <32 x i8> @test_pslldq(<32 x i8> %a0) {
5501; GENERIC-LABEL: test_pslldq:
5502; GENERIC:       # %bb.0:
5503; GENERIC-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
5504; GENERIC-NEXT:    retq # sched: [1:1.00]
5505;
5506; HASWELL-LABEL: test_pslldq:
5507; HASWELL:       # %bb.0:
5508; HASWELL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
5509; HASWELL-NEXT:    retq # sched: [7:1.00]
5510;
5511; BROADWELL-LABEL: test_pslldq:
5512; BROADWELL:       # %bb.0:
5513; BROADWELL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
5514; BROADWELL-NEXT:    retq # sched: [7:1.00]
5515;
5516; SKYLAKE-LABEL: test_pslldq:
5517; SKYLAKE:       # %bb.0:
5518; SKYLAKE-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
5519; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5520;
5521; SKX-LABEL: test_pslldq:
5522; SKX:       # %bb.0:
5523; SKX-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
5524; SKX-NEXT:    retq # sched: [7:1.00]
5525;
5526; ZNVER1-LABEL: test_pslldq:
5527; ZNVER1:       # %bb.0:
5528; ZNVER1-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [2:1.00]
5529; ZNVER1-NEXT:    retq # sched: [1:0.50]
5530  %1 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
5531  ret <32 x i8> %1
5532}
5533
; test_psllq: produces three operand forms of vpsllq on a <4 x i64> value.
;   1. psll.q intrinsic with the shift count in a register (%a1, an xmm operand
;      in the expected output).
;   2. psll.q intrinsic with the shift count loaded from %a2 (the `(%rdi)`
;      memory operand in the expected output).
;   3. a plain IR `shl` by a splat constant 2, which the expected output shows
;      as the immediate form `vpsllq $2`.
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5534define <4 x i64> @test_psllq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
5535; GENERIC-LABEL: test_psllq:
5536; GENERIC:       # %bb.0:
5537; GENERIC-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5538; GENERIC-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5539; GENERIC-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
5540; GENERIC-NEXT:    retq # sched: [1:1.00]
5541;
5542; HASWELL-LABEL: test_psllq:
5543; HASWELL:       # %bb.0:
5544; HASWELL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5545; HASWELL-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5546; HASWELL-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
5547; HASWELL-NEXT:    retq # sched: [7:1.00]
5548;
5549; BROADWELL-LABEL: test_psllq:
5550; BROADWELL:       # %bb.0:
5551; BROADWELL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5552; BROADWELL-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
5553; BROADWELL-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
5554; BROADWELL-NEXT:    retq # sched: [7:1.00]
5555;
5556; SKYLAKE-LABEL: test_psllq:
5557; SKYLAKE:       # %bb.0:
5558; SKYLAKE-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5559; SKYLAKE-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5560; SKYLAKE-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:0.50]
5561; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5562;
5563; SKX-LABEL: test_psllq:
5564; SKX:       # %bb.0:
5565; SKX-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5566; SKX-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5567; SKX-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:0.50]
5568; SKX-NEXT:    retq # sched: [7:1.00]
5569;
5570; ZNVER1-LABEL: test_psllq:
5571; ZNVER1:       # %bb.0:
5572; ZNVER1-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
5573; ZNVER1-NEXT:    vpsllq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
5574; ZNVER1-NEXT:    vpsllq $2, %ymm0, %ymm0 # sched: [1:0.25]
5575; ZNVER1-NEXT:    retq # sched: [1:0.50]
5576  %1 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
5577  %2 = load <2 x i64>, <2 x i64> *%a2, align 16
5578  %3 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %1, <2 x i64> %2)
5579  %4 = shl <4 x i64> %3, <i64 2, i64 2, i64 2, i64 2>
5580  ret <4 x i64> %4
5581}
5582declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
5583
; test_psllvd: 128-bit (xmm) variant. Calls the AVX2 psllv.d intrinsic twice on
; <4 x i32> vectors so that the expected assembly contains both operand forms
; of vpsllvd: first with two register operands (%a0, %a1), then with the
; previous result and a vector loaded from %a2 (the `(%rdi)` memory operand in
; the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5584define <4 x i32> @test_psllvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
5585; GENERIC-LABEL: test_psllvd:
5586; GENERIC:       # %bb.0:
5587; GENERIC-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
5588; GENERIC-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
5589; GENERIC-NEXT:    retq # sched: [1:1.00]
5590;
5591; HASWELL-LABEL: test_psllvd:
5592; HASWELL:       # %bb.0:
5593; HASWELL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
5594; HASWELL-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
5595; HASWELL-NEXT:    retq # sched: [7:1.00]
5596;
5597; BROADWELL-LABEL: test_psllvd:
5598; BROADWELL:       # %bb.0:
5599; BROADWELL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
5600; BROADWELL-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
5601; BROADWELL-NEXT:    retq # sched: [7:1.00]
5602;
5603; SKYLAKE-LABEL: test_psllvd:
5604; SKYLAKE:       # %bb.0:
5605; SKYLAKE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5606; SKYLAKE-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
5607; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5608;
5609; SKX-LABEL: test_psllvd:
5610; SKX:       # %bb.0:
5611; SKX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5612; SKX-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
5613; SKX-NEXT:    retq # sched: [7:1.00]
5614;
5615; ZNVER1-LABEL: test_psllvd:
5616; ZNVER1:       # %bb.0:
5617; ZNVER1-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5618; ZNVER1-NEXT:    vpsllvd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
5619; ZNVER1-NEXT:    retq # sched: [1:0.50]
5620  %1 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
5621  %2 = load <4 x i32>, <4 x i32> *%a2, align 16
5622  %3 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %1, <4 x i32> %2)
5623  ret <4 x i32> %3
5624}
5625declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
5626
; test_psllvd_ymm: 256-bit (ymm) variant. Calls the AVX2 psllv.d.256 intrinsic
; twice on <8 x i32> vectors so that the expected assembly contains both
; operand forms of vpsllvd: first with two register operands (%a0, %a1), then
; with the previous result and a vector loaded from %a2 (the `(%rdi)` memory
; operand in the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5627define <8 x i32> @test_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
5628; GENERIC-LABEL: test_psllvd_ymm:
5629; GENERIC:       # %bb.0:
5630; GENERIC-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5631; GENERIC-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5632; GENERIC-NEXT:    retq # sched: [1:1.00]
5633;
5634; HASWELL-LABEL: test_psllvd_ymm:
5635; HASWELL:       # %bb.0:
5636; HASWELL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
5637; HASWELL-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
5638; HASWELL-NEXT:    retq # sched: [7:1.00]
5639;
5640; BROADWELL-LABEL: test_psllvd_ymm:
5641; BROADWELL:       # %bb.0:
5642; BROADWELL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
5643; BROADWELL-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
5644; BROADWELL-NEXT:    retq # sched: [7:1.00]
5645;
5646; SKYLAKE-LABEL: test_psllvd_ymm:
5647; SKYLAKE:       # %bb.0:
5648; SKYLAKE-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5649; SKYLAKE-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5650; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5651;
5652; SKX-LABEL: test_psllvd_ymm:
5653; SKX:       # %bb.0:
5654; SKX-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5655; SKX-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5656; SKX-NEXT:    retq # sched: [7:1.00]
5657;
5658; ZNVER1-LABEL: test_psllvd_ymm:
5659; ZNVER1:       # %bb.0:
5660; ZNVER1-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5661; ZNVER1-NEXT:    vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5662; ZNVER1-NEXT:    retq # sched: [1:0.50]
5663  %1 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
5664  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
5665  %3 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %1, <8 x i32> %2)
5666  ret <8 x i32> %3
5667}
5668declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
5669
; test_psllvq: 128-bit (xmm) variant. Calls the AVX2 psllv.q intrinsic twice on
; <2 x i64> vectors so that the expected assembly contains both operand forms
; of vpsllvq: first with two register operands (%a0, %a1), then with the
; previous result and a vector loaded from %a2 (the `(%rdi)` memory operand in
; the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5670define <2 x i64> @test_psllvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
5671; GENERIC-LABEL: test_psllvq:
5672; GENERIC:       # %bb.0:
5673; GENERIC-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
5674; GENERIC-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
5675; GENERIC-NEXT:    retq # sched: [1:1.00]
5676;
5677; HASWELL-LABEL: test_psllvq:
5678; HASWELL:       # %bb.0:
5679; HASWELL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
5680; HASWELL-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
5681; HASWELL-NEXT:    retq # sched: [7:1.00]
5682;
5683; BROADWELL-LABEL: test_psllvq:
5684; BROADWELL:       # %bb.0:
5685; BROADWELL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
5686; BROADWELL-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
5687; BROADWELL-NEXT:    retq # sched: [7:1.00]
5688;
5689; SKYLAKE-LABEL: test_psllvq:
5690; SKYLAKE:       # %bb.0:
5691; SKYLAKE-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5692; SKYLAKE-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
5693; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5694;
5695; SKX-LABEL: test_psllvq:
5696; SKX:       # %bb.0:
5697; SKX-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5698; SKX-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
5699; SKX-NEXT:    retq # sched: [7:1.00]
5700;
5701; ZNVER1-LABEL: test_psllvq:
5702; ZNVER1:       # %bb.0:
5703; ZNVER1-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5704; ZNVER1-NEXT:    vpsllvq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
5705; ZNVER1-NEXT:    retq # sched: [1:0.50]
5706  %1 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
5707  %2 = load <2 x i64>, <2 x i64> *%a2, align 16
5708  %3 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %1, <2 x i64> %2)
5709  ret <2 x i64> %3
5710}
5711declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
5712
; test_psllvq_ymm: 256-bit (ymm) variant. Calls the AVX2 psllv.q.256 intrinsic
; twice on <4 x i64> vectors so that the expected assembly contains both
; operand forms of vpsllvq: first with two register operands (%a0, %a1), then
; with the previous result and a vector loaded from %a2 (the `(%rdi)` memory
; operand in the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5713define <4 x i64> @test_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
5714; GENERIC-LABEL: test_psllvq_ymm:
5715; GENERIC:       # %bb.0:
5716; GENERIC-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5717; GENERIC-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5718; GENERIC-NEXT:    retq # sched: [1:1.00]
5719;
5720; HASWELL-LABEL: test_psllvq_ymm:
5721; HASWELL:       # %bb.0:
5722; HASWELL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5723; HASWELL-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5724; HASWELL-NEXT:    retq # sched: [7:1.00]
5725;
5726; BROADWELL-LABEL: test_psllvq_ymm:
5727; BROADWELL:       # %bb.0:
5728; BROADWELL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5729; BROADWELL-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
5730; BROADWELL-NEXT:    retq # sched: [7:1.00]
5731;
5732; SKYLAKE-LABEL: test_psllvq_ymm:
5733; SKYLAKE:       # %bb.0:
5734; SKYLAKE-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5735; SKYLAKE-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5736; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5737;
5738; SKX-LABEL: test_psllvq_ymm:
5739; SKX:       # %bb.0:
5740; SKX-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5741; SKX-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5742; SKX-NEXT:    retq # sched: [7:1.00]
5743;
5744; ZNVER1-LABEL: test_psllvq_ymm:
5745; ZNVER1:       # %bb.0:
5746; ZNVER1-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5747; ZNVER1-NEXT:    vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5748; ZNVER1-NEXT:    retq # sched: [1:0.50]
5749  %1 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
5750  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
5751  %3 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %1, <4 x i64> %2)
5752  ret <4 x i64> %3
5753}
5754declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
5755
; test_psllw: produces three operand forms of vpsllw on a <16 x i16> value.
;   1. psll.w intrinsic with the shift count in a register (%a1, an xmm operand
;      in the expected output).
;   2. psll.w intrinsic with the shift count loaded from %a2 (the `(%rdi)`
;      memory operand in the expected output).
;   3. a plain IR `shl` by a splat constant 2, which the expected output shows
;      as the immediate form `vpsllw $2`.
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5756define <16 x i16> @test_psllw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
5757; GENERIC-LABEL: test_psllw:
5758; GENERIC:       # %bb.0:
5759; GENERIC-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5760; GENERIC-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5761; GENERIC-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
5762; GENERIC-NEXT:    retq # sched: [1:1.00]
5763;
5764; HASWELL-LABEL: test_psllw:
5765; HASWELL:       # %bb.0:
5766; HASWELL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5767; HASWELL-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5768; HASWELL-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
5769; HASWELL-NEXT:    retq # sched: [7:1.00]
5770;
5771; BROADWELL-LABEL: test_psllw:
5772; BROADWELL:       # %bb.0:
5773; BROADWELL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5774; BROADWELL-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
5775; BROADWELL-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
5776; BROADWELL-NEXT:    retq # sched: [7:1.00]
5777;
5778; SKYLAKE-LABEL: test_psllw:
5779; SKYLAKE:       # %bb.0:
5780; SKYLAKE-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5781; SKYLAKE-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5782; SKYLAKE-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:0.50]
5783; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5784;
5785; SKX-LABEL: test_psllw:
5786; SKX:       # %bb.0:
5787; SKX-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5788; SKX-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5789; SKX-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:0.50]
5790; SKX-NEXT:    retq # sched: [7:1.00]
5791;
5792; ZNVER1-LABEL: test_psllw:
5793; ZNVER1:       # %bb.0:
5794; ZNVER1-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
5795; ZNVER1-NEXT:    vpsllw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
5796; ZNVER1-NEXT:    vpsllw $2, %ymm0, %ymm0 # sched: [1:0.25]
5797; ZNVER1-NEXT:    retq # sched: [1:0.50]
5798  %1 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
5799  %2 = load <8 x i16>, <8 x i16> *%a2, align 16
5800  %3 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %1, <8 x i16> %2)
5801  %4 = shl <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
5802  ret <16 x i16> %4
5803}
5804declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
5805
; test_psrad: produces three operand forms of vpsrad on an <8 x i32> value.
;   1. psra.d intrinsic with the shift count in a register (%a1, an xmm operand
;      in the expected output).
;   2. psra.d intrinsic with the shift count loaded from %a2 (the `(%rdi)`
;      memory operand in the expected output).
;   3. a plain IR `ashr` by a splat constant 2, which the expected output shows
;      as the immediate form `vpsrad $2`.
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5806define <8 x i32> @test_psrad(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
5807; GENERIC-LABEL: test_psrad:
5808; GENERIC:       # %bb.0:
5809; GENERIC-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5810; GENERIC-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5811; GENERIC-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
5812; GENERIC-NEXT:    retq # sched: [1:1.00]
5813;
5814; HASWELL-LABEL: test_psrad:
5815; HASWELL:       # %bb.0:
5816; HASWELL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5817; HASWELL-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5818; HASWELL-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
5819; HASWELL-NEXT:    retq # sched: [7:1.00]
5820;
5821; BROADWELL-LABEL: test_psrad:
5822; BROADWELL:       # %bb.0:
5823; BROADWELL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5824; BROADWELL-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
5825; BROADWELL-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
5826; BROADWELL-NEXT:    retq # sched: [7:1.00]
5827;
5828; SKYLAKE-LABEL: test_psrad:
5829; SKYLAKE:       # %bb.0:
5830; SKYLAKE-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5831; SKYLAKE-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5832; SKYLAKE-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:0.50]
5833; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5834;
5835; SKX-LABEL: test_psrad:
5836; SKX:       # %bb.0:
5837; SKX-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5838; SKX-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5839; SKX-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:0.50]
5840; SKX-NEXT:    retq # sched: [7:1.00]
5841;
5842; ZNVER1-LABEL: test_psrad:
5843; ZNVER1:       # %bb.0:
5844; ZNVER1-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
5845; ZNVER1-NEXT:    vpsrad (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
5846; ZNVER1-NEXT:    vpsrad $2, %ymm0, %ymm0 # sched: [1:0.25]
5847; ZNVER1-NEXT:    retq # sched: [1:0.50]
5848  %1 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
5849  %2 = load <4 x i32>, <4 x i32> *%a2, align 16
5850  %3 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> %2)
5851  %4 = ashr <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
5852  ret <8 x i32> %4
5853}
5854declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
5855
; test_psravd: 128-bit (xmm) variant. Calls the AVX2 psrav.d intrinsic twice on
; <4 x i32> vectors so that the expected assembly contains both operand forms
; of vpsravd: first with two register operands (%a0, %a1), then with the
; previous result and a vector loaded from %a2 (the `(%rdi)` memory operand in
; the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5856define <4 x i32> @test_psravd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
5857; GENERIC-LABEL: test_psravd:
5858; GENERIC:       # %bb.0:
5859; GENERIC-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
5860; GENERIC-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
5861; GENERIC-NEXT:    retq # sched: [1:1.00]
5862;
5863; HASWELL-LABEL: test_psravd:
5864; HASWELL:       # %bb.0:
5865; HASWELL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
5866; HASWELL-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
5867; HASWELL-NEXT:    retq # sched: [7:1.00]
5868;
5869; BROADWELL-LABEL: test_psravd:
5870; BROADWELL:       # %bb.0:
5871; BROADWELL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
5872; BROADWELL-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
5873; BROADWELL-NEXT:    retq # sched: [7:1.00]
5874;
5875; SKYLAKE-LABEL: test_psravd:
5876; SKYLAKE:       # %bb.0:
5877; SKYLAKE-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5878; SKYLAKE-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
5879; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5880;
5881; SKX-LABEL: test_psravd:
5882; SKX:       # %bb.0:
5883; SKX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5884; SKX-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
5885; SKX-NEXT:    retq # sched: [7:1.00]
5886;
5887; ZNVER1-LABEL: test_psravd:
5888; ZNVER1:       # %bb.0:
5889; ZNVER1-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
5890; ZNVER1-NEXT:    vpsravd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
5891; ZNVER1-NEXT:    retq # sched: [1:0.50]
5892  %1 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
5893  %2 = load <4 x i32>, <4 x i32> *%a2, align 16
5894  %3 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %1, <4 x i32> %2)
5895  ret <4 x i32> %3
5896}
5897declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
5898
; test_psravd_ymm: 256-bit (ymm) variant. Calls the AVX2 psrav.d.256 intrinsic
; twice on <8 x i32> vectors so that the expected assembly contains both
; operand forms of vpsravd: first with two register operands (%a0, %a1), then
; with the previous result and a vector loaded from %a2 (the `(%rdi)` memory
; operand in the expected output).
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5899define <8 x i32> @test_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
5900; GENERIC-LABEL: test_psravd_ymm:
5901; GENERIC:       # %bb.0:
5902; GENERIC-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
5903; GENERIC-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5904; GENERIC-NEXT:    retq # sched: [1:1.00]
5905;
5906; HASWELL-LABEL: test_psravd_ymm:
5907; HASWELL:       # %bb.0:
5908; HASWELL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
5909; HASWELL-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
5910; HASWELL-NEXT:    retq # sched: [7:1.00]
5911;
5912; BROADWELL-LABEL: test_psravd_ymm:
5913; BROADWELL:       # %bb.0:
5914; BROADWELL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
5915; BROADWELL-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
5916; BROADWELL-NEXT:    retq # sched: [7:1.00]
5917;
5918; SKYLAKE-LABEL: test_psravd_ymm:
5919; SKYLAKE:       # %bb.0:
5920; SKYLAKE-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5921; SKYLAKE-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5922; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5923;
5924; SKX-LABEL: test_psravd_ymm:
5925; SKX:       # %bb.0:
5926; SKX-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5927; SKX-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5928; SKX-NEXT:    retq # sched: [7:1.00]
5929;
5930; ZNVER1-LABEL: test_psravd_ymm:
5931; ZNVER1:       # %bb.0:
5932; ZNVER1-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
5933; ZNVER1-NEXT:    vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5934; ZNVER1-NEXT:    retq # sched: [1:0.50]
5935  %1 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
5936  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
5937  %3 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %1, <8 x i32> %2)
5938  ret <8 x i32> %3
5939}
5940declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
5941
; test_psraw: produces three operand forms of vpsraw on a <16 x i16> value.
;   1. psra.w intrinsic with the shift count in a register (%a1, an xmm operand
;      in the expected output).
;   2. psra.w intrinsic with the shift count loaded from %a2 (the `(%rdi)`
;      memory operand in the expected output).
;   3. a plain IR `ashr` by a splat constant 2, which the expected output shows
;      as the immediate form `vpsraw $2`.
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5942define <16 x i16> @test_psraw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
5943; GENERIC-LABEL: test_psraw:
5944; GENERIC:       # %bb.0:
5945; GENERIC-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5946; GENERIC-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5947; GENERIC-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
5948; GENERIC-NEXT:    retq # sched: [1:1.00]
5949;
5950; HASWELL-LABEL: test_psraw:
5951; HASWELL:       # %bb.0:
5952; HASWELL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5953; HASWELL-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
5954; HASWELL-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
5955; HASWELL-NEXT:    retq # sched: [7:1.00]
5956;
5957; BROADWELL-LABEL: test_psraw:
5958; BROADWELL:       # %bb.0:
5959; BROADWELL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5960; BROADWELL-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
5961; BROADWELL-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
5962; BROADWELL-NEXT:    retq # sched: [7:1.00]
5963;
5964; SKYLAKE-LABEL: test_psraw:
5965; SKYLAKE:       # %bb.0:
5966; SKYLAKE-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5967; SKYLAKE-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5968; SKYLAKE-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:0.50]
5969; SKYLAKE-NEXT:    retq # sched: [7:1.00]
5970;
5971; SKX-LABEL: test_psraw:
5972; SKX:       # %bb.0:
5973; SKX-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5974; SKX-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
5975; SKX-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:0.50]
5976; SKX-NEXT:    retq # sched: [7:1.00]
5977;
5978; ZNVER1-LABEL: test_psraw:
5979; ZNVER1:       # %bb.0:
5980; ZNVER1-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
5981; ZNVER1-NEXT:    vpsraw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
5982; ZNVER1-NEXT:    vpsraw $2, %ymm0, %ymm0 # sched: [1:0.25]
5983; ZNVER1-NEXT:    retq # sched: [1:0.50]
5984  %1 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
5985  %2 = load <8 x i16>, <8 x i16> *%a2, align 16
5986  %3 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> %2)
5987  %4 = ashr <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
5988  ret <16 x i16> %4
5989}
5990declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
5991
; test_psrld: produces three operand forms of vpsrld on an <8 x i32> value.
;   1. psrl.d intrinsic with the shift count in a register (%a1, an xmm operand
;      in the expected output).
;   2. psrl.d intrinsic with the shift count loaded from %a2 (the `(%rdi)`
;      memory operand in the expected output).
;   3. a plain IR `lshr` by a splat constant 2, which the expected output shows
;      as the immediate form `vpsrld $2`.
; The per-CPU blocks below pin the `sched:` annotation printed for every
; instruction under -print-schedule (see the RUN lines at the top of the file).
5992define <8 x i32> @test_psrld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
5993; GENERIC-LABEL: test_psrld:
5994; GENERIC:       # %bb.0:
5995; GENERIC-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
5996; GENERIC-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
5997; GENERIC-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
5998; GENERIC-NEXT:    retq # sched: [1:1.00]
5999;
6000; HASWELL-LABEL: test_psrld:
6001; HASWELL:       # %bb.0:
6002; HASWELL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6003; HASWELL-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
6004; HASWELL-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
6005; HASWELL-NEXT:    retq # sched: [7:1.00]
6006;
6007; BROADWELL-LABEL: test_psrld:
6008; BROADWELL:       # %bb.0:
6009; BROADWELL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6010; BROADWELL-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
6011; BROADWELL-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
6012; BROADWELL-NEXT:    retq # sched: [7:1.00]
6013;
6014; SKYLAKE-LABEL: test_psrld:
6015; SKYLAKE:       # %bb.0:
6016; SKYLAKE-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6017; SKYLAKE-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6018; SKYLAKE-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:0.50]
6019; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6020;
6021; SKX-LABEL: test_psrld:
6022; SKX:       # %bb.0:
6023; SKX-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6024; SKX-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6025; SKX-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:0.50]
6026; SKX-NEXT:    retq # sched: [7:1.00]
6027;
6028; ZNVER1-LABEL: test_psrld:
6029; ZNVER1:       # %bb.0:
6030; ZNVER1-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
6031; ZNVER1-NEXT:    vpsrld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
6032; ZNVER1-NEXT:    vpsrld $2, %ymm0, %ymm0 # sched: [1:0.25]
6033; ZNVER1-NEXT:    retq # sched: [1:0.50]
6034  %1 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
6035  %2 = load <4 x i32>, <4 x i32> *%a2, align 16
6036  %3 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %1, <4 x i32> %2)
6037  %4 = lshr <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
6038  ret <8 x i32> %4
6039}
6040declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
6041
; test_psrldq - scheduling test for the 256-bit vpsrldq byte shift.
; The shufflevector keeps bytes 3..15 of each 16-byte half of %a0 and fills
; the last three positions of each half from the zeroinitializer operand
; (mask indices 32 and above). Every configuration is expected to select a
; single register-form vpsrldq; no memory-operand form is exercised here.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6042define <32 x i8> @test_psrldq(<32 x i8> %a0) {
6043; GENERIC-LABEL: test_psrldq:
6044; GENERIC:       # %bb.0:
6045; GENERIC-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
6046; GENERIC-NEXT:    retq # sched: [1:1.00]
6047;
6048; HASWELL-LABEL: test_psrldq:
6049; HASWELL:       # %bb.0:
6050; HASWELL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
6051; HASWELL-NEXT:    retq # sched: [7:1.00]
6052;
6053; BROADWELL-LABEL: test_psrldq:
6054; BROADWELL:       # %bb.0:
6055; BROADWELL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
6056; BROADWELL-NEXT:    retq # sched: [7:1.00]
6057;
6058; SKYLAKE-LABEL: test_psrldq:
6059; SKYLAKE:       # %bb.0:
6060; SKYLAKE-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
6061; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6062;
6063; SKX-LABEL: test_psrldq:
6064; SKX:       # %bb.0:
6065; SKX-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
6066; SKX-NEXT:    retq # sched: [7:1.00]
6067;
6068; ZNVER1-LABEL: test_psrldq:
6069; ZNVER1:       # %bb.0:
6070; ZNVER1-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [2:1.00]
6071; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Mask indices 0..31 select from %a0, 32..63 select from the zero vector.
6072  %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
6073  ret <32 x i8> %1
6074}
6075
; test_psrlq - scheduling test for the 256-bit vpsrlq (qword logical right
; shift). Three chained shifts produce the register-count, memory-count and
; immediate forms, each asserted for every -mcpu configuration. The
; `sched: [..]` suffixes appear because llc is run with -print-schedule.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6076define <4 x i64> @test_psrlq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
6077; GENERIC-LABEL: test_psrlq:
6078; GENERIC:       # %bb.0:
6079; GENERIC-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6080; GENERIC-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
6081; GENERIC-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
6082; GENERIC-NEXT:    retq # sched: [1:1.00]
6083;
6084; HASWELL-LABEL: test_psrlq:
6085; HASWELL:       # %bb.0:
6086; HASWELL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6087; HASWELL-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
6088; HASWELL-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
6089; HASWELL-NEXT:    retq # sched: [7:1.00]
6090;
6091; BROADWELL-LABEL: test_psrlq:
6092; BROADWELL:       # %bb.0:
6093; BROADWELL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6094; BROADWELL-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
6095; BROADWELL-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
6096; BROADWELL-NEXT:    retq # sched: [7:1.00]
6097;
6098; SKYLAKE-LABEL: test_psrlq:
6099; SKYLAKE:       # %bb.0:
6100; SKYLAKE-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6101; SKYLAKE-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6102; SKYLAKE-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.50]
6103; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6104;
6105; SKX-LABEL: test_psrlq:
6106; SKX:       # %bb.0:
6107; SKX-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6108; SKX-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6109; SKX-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.50]
6110; SKX-NEXT:    retq # sched: [7:1.00]
6111;
6112; ZNVER1-LABEL: test_psrlq:
6113; ZNVER1:       # %bb.0:
6114; ZNVER1-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
6115; ZNVER1-NEXT:    vpsrlq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
6116; ZNVER1-NEXT:    vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.25]
6117; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Shift count supplied in a register (%a1).
6118  %1 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
; Shift count loaded through %a2; asserted above as the "(%rdi)" operand form.
6119  %2 = load <2 x i64>, <2 x i64> *%a2, align 16
6120  %3 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %1, <2 x i64> %2)
; Constant splat shift by 2; asserted above as the "$2" immediate form.
6121  %4 = lshr <4 x i64> %3, <i64 2, i64 2, i64 2, i64 2>
6122  ret <4 x i64> %4
6123}
; Intrinsic used by test_psrlq: 256-bit source vector, 128-bit count operand.
6124declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
6125
; test_psrlvd - scheduling test for the 128-bit vpsrlvd, reached through the
; llvm.x86.avx2.psrlv.d intrinsic whose count operand is a full <4 x i32>
; vector. Two chained calls produce the register-operand and memory-operand
; forms, each asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6126define <4 x i32> @test_psrlvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
6127; GENERIC-LABEL: test_psrlvd:
6128; GENERIC:       # %bb.0:
6129; GENERIC-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
6130; GENERIC-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
6131; GENERIC-NEXT:    retq # sched: [1:1.00]
6132;
6133; HASWELL-LABEL: test_psrlvd:
6134; HASWELL:       # %bb.0:
6135; HASWELL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
6136; HASWELL-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
6137; HASWELL-NEXT:    retq # sched: [7:1.00]
6138;
6139; BROADWELL-LABEL: test_psrlvd:
6140; BROADWELL:       # %bb.0:
6141; BROADWELL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
6142; BROADWELL-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
6143; BROADWELL-NEXT:    retq # sched: [7:1.00]
6144;
6145; SKYLAKE-LABEL: test_psrlvd:
6146; SKYLAKE:       # %bb.0:
6147; SKYLAKE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
6148; SKYLAKE-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
6149; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6150;
6151; SKX-LABEL: test_psrlvd:
6152; SKX:       # %bb.0:
6153; SKX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
6154; SKX-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
6155; SKX-NEXT:    retq # sched: [7:1.00]
6156;
6157; ZNVER1-LABEL: test_psrlvd:
6158; ZNVER1:       # %bb.0:
6159; ZNVER1-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
6160; ZNVER1-NEXT:    vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
6161; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Count vector supplied in a register (%a1).
6162  %1 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
; Count vector loaded through %a2; asserted above as the "(%rdi)" operand form.
6163  %2 = load <4 x i32>, <4 x i32> *%a2, align 16
6164  %3 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %1, <4 x i32> %2)
6165  ret <4 x i32> %3
6166}
; Intrinsic used by test_psrlvd: source and count are both <4 x i32>.
6167declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
6168
; test_psrlvd_ymm - 256-bit counterpart of the vpsrlvd scheduling test, using
; the llvm.x86.avx2.psrlv.d.256 intrinsic with a full <8 x i32> count vector.
; Two chained calls produce the register-operand and memory-operand forms,
; each asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6169define <8 x i32> @test_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
6170; GENERIC-LABEL: test_psrlvd_ymm:
6171; GENERIC:       # %bb.0:
6172; GENERIC-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
6173; GENERIC-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
6174; GENERIC-NEXT:    retq # sched: [1:1.00]
6175;
6176; HASWELL-LABEL: test_psrlvd_ymm:
6177; HASWELL:       # %bb.0:
6178; HASWELL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
6179; HASWELL-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
6180; HASWELL-NEXT:    retq # sched: [7:1.00]
6181;
6182; BROADWELL-LABEL: test_psrlvd_ymm:
6183; BROADWELL:       # %bb.0:
6184; BROADWELL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
6185; BROADWELL-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
6186; BROADWELL-NEXT:    retq # sched: [7:1.00]
6187;
6188; SKYLAKE-LABEL: test_psrlvd_ymm:
6189; SKYLAKE:       # %bb.0:
6190; SKYLAKE-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6191; SKYLAKE-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6192; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6193;
6194; SKX-LABEL: test_psrlvd_ymm:
6195; SKX:       # %bb.0:
6196; SKX-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6197; SKX-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6198; SKX-NEXT:    retq # sched: [7:1.00]
6199;
6200; ZNVER1-LABEL: test_psrlvd_ymm:
6201; ZNVER1:       # %bb.0:
6202; ZNVER1-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6203; ZNVER1-NEXT:    vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6204; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Count vector supplied in a register (%a1).
6205  %1 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
; Count vector loaded through %a2 (32-byte aligned); asserted above as "(%rdi)".
6206  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
6207  %3 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %1, <8 x i32> %2)
6208  ret <8 x i32> %3
6209}
; Intrinsic used by test_psrlvd_ymm: source and count are both <8 x i32>.
6210declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
6211
; test_psrlvq - scheduling test for the 128-bit vpsrlvq, reached through the
; llvm.x86.avx2.psrlv.q intrinsic whose count operand is a full <2 x i64>
; vector. Two chained calls produce the register-operand and memory-operand
; forms, each asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6212define <2 x i64> @test_psrlvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
6213; GENERIC-LABEL: test_psrlvq:
6214; GENERIC:       # %bb.0:
6215; GENERIC-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
6216; GENERIC-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
6217; GENERIC-NEXT:    retq # sched: [1:1.00]
6218;
6219; HASWELL-LABEL: test_psrlvq:
6220; HASWELL:       # %bb.0:
6221; HASWELL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
6222; HASWELL-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
6223; HASWELL-NEXT:    retq # sched: [7:1.00]
6224;
6225; BROADWELL-LABEL: test_psrlvq:
6226; BROADWELL:       # %bb.0:
6227; BROADWELL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
6228; BROADWELL-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
6229; BROADWELL-NEXT:    retq # sched: [7:1.00]
6230;
6231; SKYLAKE-LABEL: test_psrlvq:
6232; SKYLAKE:       # %bb.0:
6233; SKYLAKE-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
6234; SKYLAKE-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
6235; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6236;
6237; SKX-LABEL: test_psrlvq:
6238; SKX:       # %bb.0:
6239; SKX-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
6240; SKX-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
6241; SKX-NEXT:    retq # sched: [7:1.00]
6242;
6243; ZNVER1-LABEL: test_psrlvq:
6244; ZNVER1:       # %bb.0:
6245; ZNVER1-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
6246; ZNVER1-NEXT:    vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
6247; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Count vector supplied in a register (%a1).
6248  %1 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
; Count vector loaded through %a2; asserted above as the "(%rdi)" operand form.
6249  %2 = load <2 x i64>, <2 x i64> *%a2, align 16
6250  %3 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %1, <2 x i64> %2)
6251  ret <2 x i64> %3
6252}
; Intrinsic used by test_psrlvq: source and count are both <2 x i64>.
6253declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
6254
; test_psrlvq_ymm - 256-bit counterpart of the vpsrlvq scheduling test, using
; the llvm.x86.avx2.psrlv.q.256 intrinsic with a full <4 x i64> count vector.
; Two chained calls produce the register-operand and memory-operand forms,
; each asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6255define <4 x i64> @test_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
6256; GENERIC-LABEL: test_psrlvq_ymm:
6257; GENERIC:       # %bb.0:
6258; GENERIC-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
6259; GENERIC-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
6260; GENERIC-NEXT:    retq # sched: [1:1.00]
6261;
6262; HASWELL-LABEL: test_psrlvq_ymm:
6263; HASWELL:       # %bb.0:
6264; HASWELL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
6265; HASWELL-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
6266; HASWELL-NEXT:    retq # sched: [7:1.00]
6267;
6268; BROADWELL-LABEL: test_psrlvq_ymm:
6269; BROADWELL:       # %bb.0:
6270; BROADWELL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
6271; BROADWELL-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
6272; BROADWELL-NEXT:    retq # sched: [7:1.00]
6273;
6274; SKYLAKE-LABEL: test_psrlvq_ymm:
6275; SKYLAKE:       # %bb.0:
6276; SKYLAKE-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6277; SKYLAKE-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6278; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6279;
6280; SKX-LABEL: test_psrlvq_ymm:
6281; SKX:       # %bb.0:
6282; SKX-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6283; SKX-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6284; SKX-NEXT:    retq # sched: [7:1.00]
6285;
6286; ZNVER1-LABEL: test_psrlvq_ymm:
6287; ZNVER1:       # %bb.0:
6288; ZNVER1-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6289; ZNVER1-NEXT:    vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6290; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Count vector supplied in a register (%a1).
6291  %1 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
; Count vector loaded through %a2 (32-byte aligned); asserted above as "(%rdi)".
6292  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
6293  %3 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %1, <4 x i64> %2)
6294  ret <4 x i64> %3
6295}
; Intrinsic used by test_psrlvq_ymm: source and count are both <4 x i64>.
6296declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
6297
; test_psrlw - scheduling test for the 256-bit vpsrlw (word logical right
; shift). Three chained shifts produce the register-count, memory-count and
; immediate forms, each asserted for every -mcpu configuration. The
; `sched: [..]` suffixes appear because llc is run with -print-schedule.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6298define <16 x i16> @test_psrlw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
6299; GENERIC-LABEL: test_psrlw:
6300; GENERIC:       # %bb.0:
6301; GENERIC-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6302; GENERIC-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
6303; GENERIC-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
6304; GENERIC-NEXT:    retq # sched: [1:1.00]
6305;
6306; HASWELL-LABEL: test_psrlw:
6307; HASWELL:       # %bb.0:
6308; HASWELL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6309; HASWELL-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
6310; HASWELL-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
6311; HASWELL-NEXT:    retq # sched: [7:1.00]
6312;
6313; BROADWELL-LABEL: test_psrlw:
6314; BROADWELL:       # %bb.0:
6315; BROADWELL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6316; BROADWELL-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
6317; BROADWELL-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
6318; BROADWELL-NEXT:    retq # sched: [7:1.00]
6319;
6320; SKYLAKE-LABEL: test_psrlw:
6321; SKYLAKE:       # %bb.0:
6322; SKYLAKE-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6323; SKYLAKE-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6324; SKYLAKE-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.50]
6325; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6326;
6327; SKX-LABEL: test_psrlw:
6328; SKX:       # %bb.0:
6329; SKX-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
6330; SKX-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6331; SKX-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.50]
6332; SKX-NEXT:    retq # sched: [7:1.00]
6333;
6334; ZNVER1-LABEL: test_psrlw:
6335; ZNVER1:       # %bb.0:
6336; ZNVER1-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
6337; ZNVER1-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
6338; ZNVER1-NEXT:    vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.25]
6339; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Shift count supplied in a register (%a1).
6340  %1 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
; Shift count loaded through %a2; asserted above as the "(%rdi)" operand form.
6341  %2 = load <8 x i16>, <8 x i16> *%a2, align 16
6342  %3 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %1, <8 x i16> %2)
; Constant splat shift by 2; asserted above as the "$2" immediate form.
6343  %4 = lshr <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
6344  ret <16 x i16> %4
6345}
; Intrinsic used by test_psrlw: 256-bit source vector, 128-bit count operand.
6346declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
6347
; test_psubb - scheduling test for the 256-bit vpsubb (byte subtract).
; Plain IR `sub` on <32 x i8> is used twice: once with a register operand and
; once with an operand loaded through %a2, so both the register and memory
; forms are asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6348define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
6349; GENERIC-LABEL: test_psubb:
6350; GENERIC:       # %bb.0:
6351; GENERIC-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6352; GENERIC-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6353; GENERIC-NEXT:    retq # sched: [1:1.00]
6354;
6355; HASWELL-LABEL: test_psubb:
6356; HASWELL:       # %bb.0:
6357; HASWELL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6358; HASWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6359; HASWELL-NEXT:    retq # sched: [7:1.00]
6360;
6361; BROADWELL-LABEL: test_psubb:
6362; BROADWELL:       # %bb.0:
6363; BROADWELL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6364; BROADWELL-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6365; BROADWELL-NEXT:    retq # sched: [7:1.00]
6366;
6367; SKYLAKE-LABEL: test_psubb:
6368; SKYLAKE:       # %bb.0:
6369; SKYLAKE-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6370; SKYLAKE-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6371; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6372;
6373; SKX-LABEL: test_psubb:
6374; SKX:       # %bb.0:
6375; SKX-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6376; SKX-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6377; SKX-NEXT:    retq # sched: [7:1.00]
6378;
6379; ZNVER1-LABEL: test_psubb:
6380; ZNVER1:       # %bb.0:
6381; ZNVER1-NEXT:    vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6382; ZNVER1-NEXT:    vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6383; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand subtract.
6384  %1 = sub <32 x i8> %a0, %a1
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6385  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
6386  %3 = sub <32 x i8> %1, %2
6387  ret <32 x i8> %3
6388}
6389
; test_psubd - scheduling test for the 256-bit vpsubd (dword subtract).
; Plain IR `sub` on <8 x i32> is used twice: once with a register operand and
; once with an operand loaded through %a2, so both the register and memory
; forms are asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6390define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
6391; GENERIC-LABEL: test_psubd:
6392; GENERIC:       # %bb.0:
6393; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6394; GENERIC-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6395; GENERIC-NEXT:    retq # sched: [1:1.00]
6396;
6397; HASWELL-LABEL: test_psubd:
6398; HASWELL:       # %bb.0:
6399; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6400; HASWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6401; HASWELL-NEXT:    retq # sched: [7:1.00]
6402;
6403; BROADWELL-LABEL: test_psubd:
6404; BROADWELL:       # %bb.0:
6405; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6406; BROADWELL-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6407; BROADWELL-NEXT:    retq # sched: [7:1.00]
6408;
6409; SKYLAKE-LABEL: test_psubd:
6410; SKYLAKE:       # %bb.0:
6411; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6412; SKYLAKE-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6413; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6414;
6415; SKX-LABEL: test_psubd:
6416; SKX:       # %bb.0:
6417; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6418; SKX-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6419; SKX-NEXT:    retq # sched: [7:1.00]
6420;
6421; ZNVER1-LABEL: test_psubd:
6422; ZNVER1:       # %bb.0:
6423; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6424; ZNVER1-NEXT:    vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6425; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand subtract.
6426  %1 = sub <8 x i32> %a0, %a1
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6427  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
6428  %3 = sub <8 x i32> %1, %2
6429  ret <8 x i32> %3
6430}
6431
; test_psubq - scheduling test for the 256-bit vpsubq (qword subtract).
; Plain IR `sub` on <4 x i64> is used twice: once with a register operand and
; once with an operand loaded through %a2, so both the register and memory
; forms are asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6432define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
6433; GENERIC-LABEL: test_psubq:
6434; GENERIC:       # %bb.0:
6435; GENERIC-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6436; GENERIC-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6437; GENERIC-NEXT:    retq # sched: [1:1.00]
6438;
6439; HASWELL-LABEL: test_psubq:
6440; HASWELL:       # %bb.0:
6441; HASWELL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6442; HASWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6443; HASWELL-NEXT:    retq # sched: [7:1.00]
6444;
6445; BROADWELL-LABEL: test_psubq:
6446; BROADWELL:       # %bb.0:
6447; BROADWELL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6448; BROADWELL-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6449; BROADWELL-NEXT:    retq # sched: [7:1.00]
6450;
6451; SKYLAKE-LABEL: test_psubq:
6452; SKYLAKE:       # %bb.0:
6453; SKYLAKE-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6454; SKYLAKE-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6455; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6456;
6457; SKX-LABEL: test_psubq:
6458; SKX:       # %bb.0:
6459; SKX-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6460; SKX-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6461; SKX-NEXT:    retq # sched: [7:1.00]
6462;
6463; ZNVER1-LABEL: test_psubq:
6464; ZNVER1:       # %bb.0:
6465; ZNVER1-NEXT:    vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6466; ZNVER1-NEXT:    vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6467; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand subtract.
6468  %1 = sub <4 x i64> %a0, %a1
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6469  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
6470  %3 = sub <4 x i64> %1, %2
6471  ret <4 x i64> %3
6472}
6473
; test_psubsb - scheduling test for the 256-bit vpsubsb, reached through the
; llvm.x86.avx2.psubs.b intrinsic (presumably signed saturating byte subtract,
; going by the name - confirm against the ISA reference). Two chained calls
; produce the register-operand and memory-operand forms, each asserted for
; every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6474define <32 x i8> @test_psubsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
6475; GENERIC-LABEL: test_psubsb:
6476; GENERIC:       # %bb.0:
6477; GENERIC-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6478; GENERIC-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6479; GENERIC-NEXT:    retq # sched: [1:1.00]
6480;
6481; HASWELL-LABEL: test_psubsb:
6482; HASWELL:       # %bb.0:
6483; HASWELL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6484; HASWELL-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6485; HASWELL-NEXT:    retq # sched: [7:1.00]
6486;
6487; BROADWELL-LABEL: test_psubsb:
6488; BROADWELL:       # %bb.0:
6489; BROADWELL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6490; BROADWELL-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6491; BROADWELL-NEXT:    retq # sched: [7:1.00]
6492;
6493; SKYLAKE-LABEL: test_psubsb:
6494; SKYLAKE:       # %bb.0:
6495; SKYLAKE-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6496; SKYLAKE-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6497; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6498;
6499; SKX-LABEL: test_psubsb:
6500; SKX:       # %bb.0:
6501; SKX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6502; SKX-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6503; SKX-NEXT:    retq # sched: [7:1.00]
6504;
6505; ZNVER1-LABEL: test_psubsb:
6506; ZNVER1:       # %bb.0:
6507; ZNVER1-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6508; ZNVER1-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6509; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand form.
6510  %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1)
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6511  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
6512  %3 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %1, <32 x i8> %2)
6513  ret <32 x i8> %3
6514}
; Intrinsic used by test_psubsb: both operands and the result are <32 x i8>.
6515declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
6516
; test_psubsw - scheduling test for the 256-bit vpsubsw, reached through the
; llvm.x86.avx2.psubs.w intrinsic (presumably signed saturating word subtract,
; going by the name - confirm against the ISA reference). Two chained calls
; produce the register-operand and memory-operand forms, each asserted for
; every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6517define <16 x i16> @test_psubsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
6518; GENERIC-LABEL: test_psubsw:
6519; GENERIC:       # %bb.0:
6520; GENERIC-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6521; GENERIC-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6522; GENERIC-NEXT:    retq # sched: [1:1.00]
6523;
6524; HASWELL-LABEL: test_psubsw:
6525; HASWELL:       # %bb.0:
6526; HASWELL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6527; HASWELL-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6528; HASWELL-NEXT:    retq # sched: [7:1.00]
6529;
6530; BROADWELL-LABEL: test_psubsw:
6531; BROADWELL:       # %bb.0:
6532; BROADWELL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6533; BROADWELL-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6534; BROADWELL-NEXT:    retq # sched: [7:1.00]
6535;
6536; SKYLAKE-LABEL: test_psubsw:
6537; SKYLAKE:       # %bb.0:
6538; SKYLAKE-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6539; SKYLAKE-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6540; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6541;
6542; SKX-LABEL: test_psubsw:
6543; SKX:       # %bb.0:
6544; SKX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6545; SKX-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6546; SKX-NEXT:    retq # sched: [7:1.00]
6547;
6548; ZNVER1-LABEL: test_psubsw:
6549; ZNVER1:       # %bb.0:
6550; ZNVER1-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6551; ZNVER1-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6552; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand form.
6553  %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1)
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6554  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
6555  %3 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %1, <16 x i16> %2)
6556  ret <16 x i16> %3
6557}
; Intrinsic used by test_psubsw: both operands and the result are <16 x i16>.
6558declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
6559
; test_psubusb - scheduling test for the 256-bit vpsubusb, reached through the
; llvm.x86.avx2.psubus.b intrinsic (presumably unsigned saturating byte
; subtract, going by the name - confirm against the ISA reference). Two
; chained calls produce the register-operand and memory-operand forms, each
; asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6560define <32 x i8> @test_psubusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
6561; GENERIC-LABEL: test_psubusb:
6562; GENERIC:       # %bb.0:
6563; GENERIC-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6564; GENERIC-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6565; GENERIC-NEXT:    retq # sched: [1:1.00]
6566;
6567; HASWELL-LABEL: test_psubusb:
6568; HASWELL:       # %bb.0:
6569; HASWELL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6570; HASWELL-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6571; HASWELL-NEXT:    retq # sched: [7:1.00]
6572;
6573; BROADWELL-LABEL: test_psubusb:
6574; BROADWELL:       # %bb.0:
6575; BROADWELL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6576; BROADWELL-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6577; BROADWELL-NEXT:    retq # sched: [7:1.00]
6578;
6579; SKYLAKE-LABEL: test_psubusb:
6580; SKYLAKE:       # %bb.0:
6581; SKYLAKE-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6582; SKYLAKE-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6583; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6584;
6585; SKX-LABEL: test_psubusb:
6586; SKX:       # %bb.0:
6587; SKX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6588; SKX-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6589; SKX-NEXT:    retq # sched: [7:1.00]
6590;
6591; ZNVER1-LABEL: test_psubusb:
6592; ZNVER1:       # %bb.0:
6593; ZNVER1-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6594; ZNVER1-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6595; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand form.
6596  %1 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6597  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
6598  %3 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %1, <32 x i8> %2)
6599  ret <32 x i8> %3
6600}
; Intrinsic used by test_psubusb: both operands and the result are <32 x i8>.
6601declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
6602
; test_psubusw - scheduling test for the 256-bit vpsubusw, reached through the
; llvm.x86.avx2.psubus.w intrinsic (presumably unsigned saturating word
; subtract, going by the name - confirm against the ISA reference). Two
; chained calls produce the register-operand and memory-operand forms, each
; asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6603define <16 x i16> @test_psubusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
6604; GENERIC-LABEL: test_psubusw:
6605; GENERIC:       # %bb.0:
6606; GENERIC-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6607; GENERIC-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6608; GENERIC-NEXT:    retq # sched: [1:1.00]
6609;
6610; HASWELL-LABEL: test_psubusw:
6611; HASWELL:       # %bb.0:
6612; HASWELL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6613; HASWELL-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6614; HASWELL-NEXT:    retq # sched: [7:1.00]
6615;
6616; BROADWELL-LABEL: test_psubusw:
6617; BROADWELL:       # %bb.0:
6618; BROADWELL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6619; BROADWELL-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6620; BROADWELL-NEXT:    retq # sched: [7:1.00]
6621;
6622; SKYLAKE-LABEL: test_psubusw:
6623; SKYLAKE:       # %bb.0:
6624; SKYLAKE-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6625; SKYLAKE-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6626; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6627;
6628; SKX-LABEL: test_psubusw:
6629; SKX:       # %bb.0:
6630; SKX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6631; SKX-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6632; SKX-NEXT:    retq # sched: [7:1.00]
6633;
6634; ZNVER1-LABEL: test_psubusw:
6635; ZNVER1:       # %bb.0:
6636; ZNVER1-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6637; ZNVER1-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6638; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand form.
6639  %1 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6640  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
6641  %3 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %1, <16 x i16> %2)
6642  ret <16 x i16> %3
6643}
; Intrinsic used by test_psubusw: both operands and the result are <16 x i16>.
6644declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
6645
; test_psubw - scheduling test for the 256-bit vpsubw (word subtract).
; Plain IR `sub` on <16 x i16> is used twice: once with a register operand and
; once with an operand loaded through %a2, so both the register and memory
; forms are asserted for every -mcpu configuration.
; The assertion comments are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
6646define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
6647; GENERIC-LABEL: test_psubw:
6648; GENERIC:       # %bb.0:
6649; GENERIC-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6650; GENERIC-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6651; GENERIC-NEXT:    retq # sched: [1:1.00]
6652;
6653; HASWELL-LABEL: test_psubw:
6654; HASWELL:       # %bb.0:
6655; HASWELL-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6656; HASWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6657; HASWELL-NEXT:    retq # sched: [7:1.00]
6658;
6659; BROADWELL-LABEL: test_psubw:
6660; BROADWELL:       # %bb.0:
6661; BROADWELL-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6662; BROADWELL-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
6663; BROADWELL-NEXT:    retq # sched: [7:1.00]
6664;
6665; SKYLAKE-LABEL: test_psubw:
6666; SKYLAKE:       # %bb.0:
6667; SKYLAKE-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6668; SKYLAKE-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6669; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6670;
6671; SKX-LABEL: test_psubw:
6672; SKX:       # %bb.0:
6673; SKX-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6674; SKX-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6675; SKX-NEXT:    retq # sched: [7:1.00]
6676;
6677; ZNVER1-LABEL: test_psubw:
6678; ZNVER1:       # %bb.0:
6679; ZNVER1-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6680; ZNVER1-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
6681; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Register-operand subtract.
6682  %1 = sub <16 x i16> %a0, %a1
; Second operand loaded through %a2; asserted above as the "(%rdi)" form.
6683  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
6684  %3 = sub <16 x i16> %1, %2
6685  ret <16 x i16> %3
6686}
6687
; Scheduling test for the 256-bit vpunpckhbw instruction.
; The IR interleaves the high eight bytes of each 128-bit lane of two <32 x i8>
; vectors twice: once with both operands in registers and once with the second
; operand loaded from %a2, so both the register and the memory form appear in the
; expected output. The autogenerated lines below pin the "sched:" annotation that
; -print-schedule emits for every -mcpu configuration in the RUN lines at the top
; of the file; regenerate them with utils/update_llc_test_checks.py, do not edit
; them by hand.
6688define <32 x i8> @test_punpckhbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
6689; GENERIC-LABEL: test_punpckhbw:
6690; GENERIC:       # %bb.0:
6691; GENERIC-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
6692; GENERIC-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
6693; GENERIC-NEXT:    retq # sched: [1:1.00]
6694;
6695; HASWELL-LABEL: test_punpckhbw:
6696; HASWELL:       # %bb.0:
6697; HASWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
6698; HASWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
6699; HASWELL-NEXT:    retq # sched: [7:1.00]
6700;
6701; BROADWELL-LABEL: test_punpckhbw:
6702; BROADWELL:       # %bb.0:
6703; BROADWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
6704; BROADWELL-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [7:1.00]
6705; BROADWELL-NEXT:    retq # sched: [7:1.00]
6706;
6707; SKYLAKE-LABEL: test_punpckhbw:
6708; SKYLAKE:       # %bb.0:
6709; SKYLAKE-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
6710; SKYLAKE-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
6711; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6712;
6713; SKX-LABEL: test_punpckhbw:
6714; SKX:       # %bb.0:
6715; SKX-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
6716; SKX-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
6717; SKX-NEXT:    retq # sched: [7:1.00]
6718;
6719; ZNVER1-LABEL: test_punpckhbw:
6720; ZNVER1:       # %bb.0:
6721; ZNVER1-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:0.25]
6722; ZNVER1-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:0.50]
6723; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register interleave (mask indices 32..63 select from %a1);
; %3 repeats the same mask with the 32-byte-aligned load %2 as second operand.
6724  %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
6725  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
6726  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
6727  ret <32 x i8> %3
6728}
6729
; Scheduling test for the 256-bit vpunpckhdq instruction.
; The IR interleaves the high two dwords of each 128-bit lane of two <8 x i32>
; vectors, first register-register and then with the second operand loaded from
; %a2, and finally adds 1 to every element. The autogenerated lines below pin the
; "sched:" annotation that -print-schedule emits for every -mcpu configuration in
; the RUN lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py, do not edit them by hand.
6730define <8 x i32> @test_punpckhdq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
6731; GENERIC-LABEL: test_punpckhdq:
6732; GENERIC:       # %bb.0:
6733; GENERIC-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
6734; GENERIC-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
6735; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6736; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6737; GENERIC-NEXT:    retq # sched: [1:1.00]
6738;
6739; HASWELL-LABEL: test_punpckhdq:
6740; HASWELL:       # %bb.0:
6741; HASWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
6742; HASWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
6743; HASWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6744; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6745; HASWELL-NEXT:    retq # sched: [7:1.00]
6746;
6747; BROADWELL-LABEL: test_punpckhdq:
6748; BROADWELL:       # %bb.0:
6749; BROADWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
6750; BROADWELL-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
6751; BROADWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6752; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6753; BROADWELL-NEXT:    retq # sched: [7:1.00]
6754;
6755; SKYLAKE-LABEL: test_punpckhdq:
6756; SKYLAKE:       # %bb.0:
6757; SKYLAKE-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
6758; SKYLAKE-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
6759; SKYLAKE-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6760; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6761; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6762;
6763; SKX-LABEL: test_punpckhdq:
6764; SKX:       # %bb.0:
6765; SKX-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
6766; SKX-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
6767; SKX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6768; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6769; SKX-NEXT:    retq # sched: [7:1.00]
6770;
6771; ZNVER1-LABEL: test_punpckhdq:
6772; ZNVER1:       # %bb.0:
6773; ZNVER1-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.25]
6774; ZNVER1-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50]
6775; ZNVER1-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.25]
6776; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6777; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register interleave (mask indices 8..15 select from %a1);
; %3 repeats the same mask with the 32-byte-aligned load %2 as second operand.
; The trailing add of 1 shows up in the expected assembly as vpcmpeqd (all-ones)
; followed by vpsubd.
; NOTE(review): the add is presumably there to keep the shuffles in the integer
; domain so vpunpckhdq is selected instead of a floating-point unpack - confirm.
6778  %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
6779  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
6780  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
6781  %4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
6782  ret <8 x i32> %4
6783}
6784
; Scheduling test for the 256-bit vpunpckhqdq instruction.
; The IR interleaves the high qword of each 128-bit lane of %a0 with %a1
; (register form) and, independently, of %a0 with a vector loaded from %a2
; (memory form), then adds the two results. The autogenerated lines below pin the
; "sched:" annotation that -print-schedule emits for every -mcpu configuration in
; the RUN lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py, do not edit them by hand.
6785define <4 x i64> @test_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
6786; GENERIC-LABEL: test_punpckhqdq:
6787; GENERIC:       # %bb.0:
6788; GENERIC-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
6789; GENERIC-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
6790; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
6791; GENERIC-NEXT:    retq # sched: [1:1.00]
6792;
6793; HASWELL-LABEL: test_punpckhqdq:
6794; HASWELL:       # %bb.0:
6795; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
6796; HASWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
6797; HASWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
6798; HASWELL-NEXT:    retq # sched: [7:1.00]
6799;
6800; BROADWELL-LABEL: test_punpckhqdq:
6801; BROADWELL:       # %bb.0:
6802; BROADWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
6803; BROADWELL-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
6804; BROADWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
6805; BROADWELL-NEXT:    retq # sched: [7:1.00]
6806;
6807; SKYLAKE-LABEL: test_punpckhqdq:
6808; SKYLAKE:       # %bb.0:
6809; SKYLAKE-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
6810; SKYLAKE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
6811; SKYLAKE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
6812; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6813;
6814; SKX-LABEL: test_punpckhqdq:
6815; SKX:       # %bb.0:
6816; SKX-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
6817; SKX-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
6818; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
6819; SKX-NEXT:    retq # sched: [7:1.00]
6820;
6821; ZNVER1-LABEL: test_punpckhqdq:
6822; ZNVER1:       # %bb.0:
6823; ZNVER1-NEXT:    vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.25]
6824; ZNVER1-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:0.50]
6825; ZNVER1-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
6826; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Unlike the byte/word/dword variants, the second shuffle takes %a0 (not %1) as
; its first operand, so the two shuffles are independent and are combined by the
; add, which appears as vpaddq in the expected assembly.
; NOTE(review): the add is presumably there to keep the shuffles in the integer
; domain so vpunpckhqdq is selected instead of a floating-point unpack - confirm.
6827  %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
6828  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
6829  %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
6830  %4 = add <4 x i64> %1, %3
6831  ret <4 x i64> %4
6832}
6833
; Scheduling test for the 256-bit vpunpckhwd instruction.
; The IR interleaves the high four words of each 128-bit lane of two <16 x i16>
; vectors twice: once with both operands in registers and once with the second
; operand loaded from %a2, so both the register and the memory form appear in the
; expected output. The autogenerated lines below pin the "sched:" annotation that
; -print-schedule emits for every -mcpu configuration in the RUN lines at the top
; of the file; regenerate them with utils/update_llc_test_checks.py, do not edit
; them by hand.
6834define <16 x i16> @test_punpckhwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
6835; GENERIC-LABEL: test_punpckhwd:
6836; GENERIC:       # %bb.0:
6837; GENERIC-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
6838; GENERIC-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
6839; GENERIC-NEXT:    retq # sched: [1:1.00]
6840;
6841; HASWELL-LABEL: test_punpckhwd:
6842; HASWELL:       # %bb.0:
6843; HASWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
6844; HASWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
6845; HASWELL-NEXT:    retq # sched: [7:1.00]
6846;
6847; BROADWELL-LABEL: test_punpckhwd:
6848; BROADWELL:       # %bb.0:
6849; BROADWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
6850; BROADWELL-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [7:1.00]
6851; BROADWELL-NEXT:    retq # sched: [7:1.00]
6852;
6853; SKYLAKE-LABEL: test_punpckhwd:
6854; SKYLAKE:       # %bb.0:
6855; SKYLAKE-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
6856; SKYLAKE-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
6857; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6858;
6859; SKX-LABEL: test_punpckhwd:
6860; SKX:       # %bb.0:
6861; SKX-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
6862; SKX-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
6863; SKX-NEXT:    retq # sched: [7:1.00]
6864;
6865; ZNVER1-LABEL: test_punpckhwd:
6866; ZNVER1:       # %bb.0:
6867; ZNVER1-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:0.25]
6868; ZNVER1-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:0.50]
6869; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register interleave (mask indices 16..31 select from %a1);
; %3 repeats the same mask with the 32-byte-aligned load %2 as second operand.
6870  %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
6871  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
6872  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
6873  ret <16 x i16> %3
6874}
6875
; Scheduling test for the 256-bit vpunpcklbw instruction.
; The IR interleaves the low eight bytes of each 128-bit lane of two <32 x i8>
; vectors twice: once with both operands in registers and once with the second
; operand loaded from %a2, so both the register and the memory form appear in the
; expected output. The autogenerated lines below pin the "sched:" annotation that
; -print-schedule emits for every -mcpu configuration in the RUN lines at the top
; of the file; regenerate them with utils/update_llc_test_checks.py, do not edit
; them by hand.
6876define <32 x i8> @test_punpcklbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
6877; GENERIC-LABEL: test_punpcklbw:
6878; GENERIC:       # %bb.0:
6879; GENERIC-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
6880; GENERIC-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
6881; GENERIC-NEXT:    retq # sched: [1:1.00]
6882;
6883; HASWELL-LABEL: test_punpcklbw:
6884; HASWELL:       # %bb.0:
6885; HASWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
6886; HASWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
6887; HASWELL-NEXT:    retq # sched: [7:1.00]
6888;
6889; BROADWELL-LABEL: test_punpcklbw:
6890; BROADWELL:       # %bb.0:
6891; BROADWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
6892; BROADWELL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [7:1.00]
6893; BROADWELL-NEXT:    retq # sched: [7:1.00]
6894;
6895; SKYLAKE-LABEL: test_punpcklbw:
6896; SKYLAKE:       # %bb.0:
6897; SKYLAKE-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
6898; SKYLAKE-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
6899; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6900;
6901; SKX-LABEL: test_punpcklbw:
6902; SKX:       # %bb.0:
6903; SKX-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
6904; SKX-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
6905; SKX-NEXT:    retq # sched: [7:1.00]
6906;
6907; ZNVER1-LABEL: test_punpcklbw:
6908; ZNVER1:       # %bb.0:
6909; ZNVER1-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:0.25]
6910; ZNVER1-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:0.50]
6911; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register interleave (mask indices 32..63 select from %a1);
; %3 repeats the same mask with the 32-byte-aligned load %2 as second operand.
6912  %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
6913  %2 = load <32 x i8>, <32 x i8> *%a2, align 32
6914  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
6915  ret <32 x i8> %3
6916}
6917
; Scheduling test for the 256-bit vpunpckldq instruction.
; The IR interleaves the low two dwords of each 128-bit lane of two <8 x i32>
; vectors, first register-register and then with the second operand loaded from
; %a2, and finally adds 1 to every element. The autogenerated lines below pin the
; "sched:" annotation that -print-schedule emits for every -mcpu configuration in
; the RUN lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py, do not edit them by hand.
6918define <8 x i32> @test_punpckldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
6919; GENERIC-LABEL: test_punpckldq:
6920; GENERIC:       # %bb.0:
6921; GENERIC-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
6922; GENERIC-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
6923; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6924; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6925; GENERIC-NEXT:    retq # sched: [1:1.00]
6926;
6927; HASWELL-LABEL: test_punpckldq:
6928; HASWELL:       # %bb.0:
6929; HASWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
6930; HASWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
6931; HASWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6932; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6933; HASWELL-NEXT:    retq # sched: [7:1.00]
6934;
6935; BROADWELL-LABEL: test_punpckldq:
6936; BROADWELL:       # %bb.0:
6937; BROADWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
6938; BROADWELL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
6939; BROADWELL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6940; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
6941; BROADWELL-NEXT:    retq # sched: [7:1.00]
6942;
6943; SKYLAKE-LABEL: test_punpckldq:
6944; SKYLAKE:       # %bb.0:
6945; SKYLAKE-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
6946; SKYLAKE-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
6947; SKYLAKE-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6948; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6949; SKYLAKE-NEXT:    retq # sched: [7:1.00]
6950;
6951; SKX-LABEL: test_punpckldq:
6952; SKX:       # %bb.0:
6953; SKX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
6954; SKX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
6955; SKX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
6956; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
6957; SKX-NEXT:    retq # sched: [7:1.00]
6958;
6959; ZNVER1-LABEL: test_punpckldq:
6960; ZNVER1:       # %bb.0:
6961; ZNVER1-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.25]
6962; ZNVER1-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:0.50]
6963; ZNVER1-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.25]
6964; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
6965; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register interleave (mask indices 8..15 select from %a1);
; %3 repeats the same mask with the 32-byte-aligned load %2 as second operand.
; The trailing add of 1 shows up in the expected assembly as vpcmpeqd (all-ones)
; followed by vpsubd.
; NOTE(review): the add is presumably there to keep the shuffles in the integer
; domain so vpunpckldq is selected instead of a floating-point unpack - confirm.
6966  %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
6967  %2 = load <8 x i32>, <8 x i32> *%a2, align 32
6968  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
6969  %4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
6970  ret <8 x i32> %4
6971}
6972
; Scheduling test for the 256-bit vpunpcklqdq instruction.
; The IR interleaves the low qword of each 128-bit lane of %a0 with %a1
; (register form) and, independently, of %a0 with a vector loaded from %a2
; (memory form), then adds the two results. The autogenerated lines below pin the
; "sched:" annotation that -print-schedule emits for every -mcpu configuration in
; the RUN lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py, do not edit them by hand.
6973define <4 x i64> @test_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
6974; GENERIC-LABEL: test_punpcklqdq:
6975; GENERIC:       # %bb.0:
6976; GENERIC-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
6977; GENERIC-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
6978; GENERIC-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
6979; GENERIC-NEXT:    retq # sched: [1:1.00]
6980;
6981; HASWELL-LABEL: test_punpcklqdq:
6982; HASWELL:       # %bb.0:
6983; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
6984; HASWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
6985; HASWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
6986; HASWELL-NEXT:    retq # sched: [7:1.00]
6987;
6988; BROADWELL-LABEL: test_punpcklqdq:
6989; BROADWELL:       # %bb.0:
6990; BROADWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
6991; BROADWELL-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
6992; BROADWELL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
6993; BROADWELL-NEXT:    retq # sched: [7:1.00]
6994;
6995; SKYLAKE-LABEL: test_punpcklqdq:
6996; SKYLAKE:       # %bb.0:
6997; SKYLAKE-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
6998; SKYLAKE-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
6999; SKYLAKE-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
7000; SKYLAKE-NEXT:    retq # sched: [7:1.00]
7001;
7002; SKX-LABEL: test_punpcklqdq:
7003; SKX:       # %bb.0:
7004; SKX-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
7005; SKX-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
7006; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
7007; SKX-NEXT:    retq # sched: [7:1.00]
7008;
7009; ZNVER1-LABEL: test_punpcklqdq:
7010; ZNVER1:       # %bb.0:
7011; ZNVER1-NEXT:    vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.25]
7012; ZNVER1-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:0.50]
7013; ZNVER1-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
7014; ZNVER1-NEXT:    retq # sched: [1:0.50]
; Unlike the byte/word/dword variants, the second shuffle takes %a0 (not %1) as
; its first operand, so the two shuffles are independent and are combined by the
; add, which appears as vpaddq in the expected assembly.
; NOTE(review): the add is presumably there to keep the shuffles in the integer
; domain so vpunpcklqdq is selected instead of a floating-point unpack - confirm.
7015  %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
7016  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
7017  %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
7018  %4 = add <4 x i64> %1, %3
7019  ret <4 x i64> %4
7020}
7021
; Scheduling test for the 256-bit vpunpcklwd instruction.
; The IR interleaves the low four words of each 128-bit lane of two <16 x i16>
; vectors twice: once with both operands in registers and once with the second
; operand loaded from %a2, so both the register and the memory form appear in the
; expected output. The autogenerated lines below pin the "sched:" annotation that
; -print-schedule emits for every -mcpu configuration in the RUN lines at the top
; of the file; regenerate them with utils/update_llc_test_checks.py, do not edit
; them by hand.
7022define <16 x i16> @test_punpcklwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
7023; GENERIC-LABEL: test_punpcklwd:
7024; GENERIC:       # %bb.0:
7025; GENERIC-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
7026; GENERIC-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
7027; GENERIC-NEXT:    retq # sched: [1:1.00]
7028;
7029; HASWELL-LABEL: test_punpcklwd:
7030; HASWELL:       # %bb.0:
7031; HASWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
7032; HASWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
7033; HASWELL-NEXT:    retq # sched: [7:1.00]
7034;
7035; BROADWELL-LABEL: test_punpcklwd:
7036; BROADWELL:       # %bb.0:
7037; BROADWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
7038; BROADWELL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [7:1.00]
7039; BROADWELL-NEXT:    retq # sched: [7:1.00]
7040;
7041; SKYLAKE-LABEL: test_punpcklwd:
7042; SKYLAKE:       # %bb.0:
7043; SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
7044; SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
7045; SKYLAKE-NEXT:    retq # sched: [7:1.00]
7046;
7047; SKX-LABEL: test_punpcklwd:
7048; SKX:       # %bb.0:
7049; SKX-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
7050; SKX-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
7051; SKX-NEXT:    retq # sched: [7:1.00]
7052;
7053; ZNVER1-LABEL: test_punpcklwd:
7054; ZNVER1:       # %bb.0:
7055; ZNVER1-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:0.25]
7056; ZNVER1-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:0.50]
7057; ZNVER1-NEXT:    retq # sched: [1:0.50]
; %1 is the register-register interleave (mask indices 16..31 select from %a1);
; %3 repeats the same mask with the 32-byte-aligned load %2 as second operand.
7058  %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
7059  %2 = load <16 x i16>, <16 x i16> *%a2, align 32
7060  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
7061  ret <16 x i16> %3
7062}
7063
; Scheduling test for the 256-bit vpxor instruction.
; The IR xors %a0 with %a1 (register form), xors the result with a vector loaded
; from %a2 (memory form), and then adds %a1. The autogenerated lines below pin
; the "sched:" annotation that -print-schedule emits for every -mcpu
; configuration in the RUN lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py, do not edit them by hand.
7064define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
7065; GENERIC-LABEL: test_pxor:
7066; GENERIC:       # %bb.0:
7067; GENERIC-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
7068; GENERIC-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
7069; GENERIC-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
7070; GENERIC-NEXT:    retq # sched: [1:1.00]
7071;
7072; HASWELL-LABEL: test_pxor:
7073; HASWELL:       # %bb.0:
7074; HASWELL-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
7075; HASWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
7076; HASWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
7077; HASWELL-NEXT:    retq # sched: [7:1.00]
7078;
7079; BROADWELL-LABEL: test_pxor:
7080; BROADWELL:       # %bb.0:
7081; BROADWELL-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
7082; BROADWELL-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
7083; BROADWELL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
7084; BROADWELL-NEXT:    retq # sched: [7:1.00]
7085;
7086; SKYLAKE-LABEL: test_pxor:
7087; SKYLAKE:       # %bb.0:
7088; SKYLAKE-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
7089; SKYLAKE-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
7090; SKYLAKE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
7091; SKYLAKE-NEXT:    retq # sched: [7:1.00]
7092;
7093; SKX-LABEL: test_pxor:
7094; SKX:       # %bb.0:
7095; SKX-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
7096; SKX-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
7097; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
7098; SKX-NEXT:    retq # sched: [7:1.00]
7099;
7100; ZNVER1-LABEL: test_pxor:
7101; ZNVER1:       # %bb.0:
7102; ZNVER1-NEXT:    vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
7103; ZNVER1-NEXT:    vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
7104; ZNVER1-NEXT:    vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
7105; ZNVER1-NEXT:    retq # sched: [1:0.50]
; The final add of %a1 appears as vpaddq in the expected assembly.
; NOTE(review): the add is presumably there to keep the xors in the integer
; domain so vpxor is selected instead of a floating-point xor - confirm.
7106  %1 = xor <4 x i64> %a0, %a1
7107  %2 = load <4 x i64>, <4 x i64> *%a2, align 32
7108  %3 = xor <4 x i64> %1, %2
7109  %4 = add <4 x i64> %3, %a1
7110  ret <4 x i64> %4
7111}
7112
; Metadata node !0: a tuple holding the single constant i32 1.
; NOTE(review): the instructions that reference !0 are outside this part of the
; file - confirm its users before changing or removing it.
7113!0 = !{i32 1}
7114