; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX

; This test is a collection of AVX-512 shuffle instructions, used to check their scheduling information.

; Unmasked constant-index permute of a <16 x i16> register value.
; Both check prefixes expect one vpermw whose index vector is set up by vmovdqa.
define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
; GENERIC-LABEL: test_16xi16_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  ret <16 x i16> %res
}
; Merge-masked permute: lanes where %mask is zero take the permuted element,
; all other lanes keep %vec2. The checks expect vptestnmw to build the k-mask
; and a vpermw executed under {%k1}.
define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
43
; Zero-masked permute: lanes where %mask is zero take the permuted element,
; all other lanes are zeroed. The checks expect vpermw under {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Merge-masked permute with a second constant index pattern: lanes where %mask
; is zero take the permuted element, all other lanes keep %vec2.
define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
84
; Zero-masked permute with the second constant index pattern: lanes where
; %mask is zero take the permuted element, all other lanes are zeroed.
define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Merge-masked permute with a third constant index pattern: lanes where %mask
; is zero take the permuted element, all other lanes keep %vec2.
define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
125
; Zero-masked permute with the third constant index pattern: lanes where %mask
; is zero take the permuted element, all other lanes are zeroed.
define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Unmasked constant-index permute of a <16 x i16> register value using a
; fourth index pattern; both check prefixes expect a single vpermw.
define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
; GENERIC-LABEL: test_16xi16_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  ret <16 x i16> %res
}
; Merge-masked permute with the fourth constant index pattern: lanes where
; %mask is zero take the permuted element, all other lanes keep %vec2.
define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
181
; Zero-masked permute with the fourth constant index pattern: lanes where
; %mask is zero take the permuted element, all other lanes are zeroed.
define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
; Unmasked constant-index permute of a <16 x i16> value loaded from %vp.
; The checks expect the load to be folded into vpermw's memory operand.
define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
; GENERIC-LABEL: test_16xi16_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  ret <16 x i16> %res
}
; Merge-masked permute of a vector loaded from %vp: lanes where %mask is zero
; take the permuted element, all other lanes keep %vec2. The checks expect the
; load folded into a vpermw executed under {%k1}.
define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
237
; Zero-masked permute of a vector loaded from %vp: lanes where %mask is zero
; take the permuted element, all other lanes are zeroed. The checks expect the
; load folded into a vpermw executed under {%k1} {z}.
define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
258
; Merge-masked permute of a vector loaded from %vp, second index pattern:
; lanes where %mask is zero take the permuted element, others keep %vec2.
define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
279
; Zero-masked permute of a vector loaded from %vp, second index pattern:
; lanes where %mask is zero take the permuted element, others are zeroed.
define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
300
; Merge-masked permute of a vector loaded from %vp, third index pattern:
; lanes where %mask is zero take the permuted element, others keep %vec2.
define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
321
; Zero-masked permute of a vector loaded from %vp, third index pattern:
; lanes where %mask is zero take the permuted element, others are zeroed.
define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
342
; Unmasked constant-index permute of a <16 x i16> value loaded from %vp, using
; a fourth index pattern; the load is expected to fold into vpermw.
define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
; GENERIC-LABEL: test_16xi16_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi16_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  ret <16 x i16> %res
}
; Merge-masked permute of a vector loaded from %vp, fourth index pattern:
; lanes where %mask is zero take the permuted element, others keep %vec2.
define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
  ret <16 x i16> %res
}
379
; Zero-masked permute of a vector loaded from %vp, fourth index pattern:
; lanes where %mask is zero take the permuted element, others are zeroed.
define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i16>, <16 x i16>* %vp
  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
  ret <16 x i16> %res
}
400
; Unmasked constant-index permute of a <32 x i16> register value.
; Both check prefixes expect one zmm vpermw whose index vector is set up by
; vmovdqa64.
define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
; GENERIC-LABEL: test_32xi16_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi16_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
  ret <32 x i16> %res
}
; Merge-masked 512-bit permute: lanes where %mask is zero take the permuted
; element, all other lanes keep %vec2. The checks expect vptestnmw to build
; the k-mask and a zmm vpermw executed under {%k1}.
define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}
437
; Zero-masked 512-bit permute: lanes where %mask is zero take the permuted
; element, all other lanes are zeroed. The checks expect a zmm vpermw under
; {%k1} {z}.
define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Merge-masked 512-bit permute with a second constant index pattern: lanes
; where %mask is zero take the permuted element, others keep %vec2.
define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}
478
; Zero-masked 512-bit permute with the second constant index pattern: lanes
; where %mask is zero take the permuted element, others are zeroed.
define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}
; Merge-masked 512-bit permute with a third constant index pattern: lanes
; where %mask is zero take the permuted element, others keep %vec2.
define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi16_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
  ret <32 x i16> %res
}
519
; Zero-masked permute of a <32 x i16> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes are zeroed. Both run lines expect vptestnmw to build %k1 followed
; by vpermw with {%k1} {z} writing zmm0 directly.
520define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
521; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
522; GENERIC:       # %bb.0:
523; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
524; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
525; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
526; GENERIC-NEXT:    retq # sched: [1:1.00]
527;
528; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
529; SKX:       # %bb.0:
530; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
531; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
532; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
533; SKX-NEXT:    retq # sched: [7:1.00]
534  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
535  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
536  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
537  ret <32 x i16> %res
538}
; Unmasked permute of a <32 x i16> register operand with a constant index
; vector. Both run lines expect the index vector to be loaded into zmm1 and a
; single vpermw producing the result in zmm0.
539define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
540; GENERIC-LABEL: test_32xi16_perm_mask3:
541; GENERIC:       # %bb.0:
542; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
543; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
544; GENERIC-NEXT:    retq # sched: [1:1.00]
545;
546; SKX-LABEL: test_32xi16_perm_mask3:
547; SKX:       # %bb.0:
548; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
549; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
550; SKX-NEXT:    retq # sched: [7:1.00]
551  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
552  ret <32 x i16> %res
553}
; Merge-masked permute of a <32 x i16> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes keep %vec2. Both run lines expect vptestnmw to build %k1, then a
; {%k1}-masked vpermw into zmm1 and a copy of the result into zmm0.
554define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
555; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
556; GENERIC:       # %bb.0:
557; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
558; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
559; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
560; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
561; GENERIC-NEXT:    retq # sched: [1:1.00]
562;
563; SKX-LABEL: test_masked_32xi16_perm_mask3:
564; SKX:       # %bb.0:
565; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
566; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
567; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
568; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
569; SKX-NEXT:    retq # sched: [7:1.00]
570  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
571  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
572  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
573  ret <32 x i16> %res
574}
575
; Zero-masked permute of a <32 x i16> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes are zeroed. Both run lines expect vptestnmw to build %k1 followed
; by vpermw with {%k1} {z} writing zmm0 directly.
576define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
577; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
578; GENERIC:       # %bb.0:
579; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
580; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
581; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
582; GENERIC-NEXT:    retq # sched: [1:1.00]
583;
584; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
585; SKX:       # %bb.0:
586; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
587; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
588; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
589; SKX-NEXT:    retq # sched: [7:1.00]
590  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
591  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
592  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
593  ret <32 x i16> %res
594}
; Unmasked permute of a <32 x i16> vector loaded from %vp with a constant index
; vector. Both run lines expect the load to be folded into vpermw as a (%rdi)
; memory operand, with the index vector held in zmm0.
595define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
596; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
597; GENERIC:       # %bb.0:
598; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
599; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
600; GENERIC-NEXT:    retq # sched: [1:1.00]
601;
602; SKX-LABEL: test_32xi16_perm_mem_mask0:
603; SKX:       # %bb.0:
604; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
605; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
606; SKX-NEXT:    retq # sched: [7:1.00]
607  %vec = load <32 x i16>, <32 x i16>* %vp
608  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
609  ret <32 x i16> %res
610}
; Merge-masked permute of a <32 x i16> vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled value, all other lanes keep %vec2.
; Both run lines expect vptestnmw to build %k1 and a {%k1}-masked vpermw with
; the load folded as a (%rdi) memory operand, merging straight into zmm0.
611define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
612; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
613; GENERIC:       # %bb.0:
614; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
615; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
616; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
617; GENERIC-NEXT:    retq # sched: [1:1.00]
618;
619; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
620; SKX:       # %bb.0:
621; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
622; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
623; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
624; SKX-NEXT:    retq # sched: [7:1.00]
625  %vec = load <32 x i16>, <32 x i16>* %vp
626  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
627  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
628  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
629  ret <32 x i16> %res
630}
631
; Zero-masked permute of a <32 x i16> vector loaded from %vp: lanes whose %mask
; element is zero take the shuffled value, all other lanes are zeroed. Both run
; lines expect vptestnmw to build %k1 and vpermw with {%k1} {z}, the load being
; folded as a (%rdi) memory operand.
632define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
633; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
634; GENERIC:       # %bb.0:
635; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
636; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
637; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
638; GENERIC-NEXT:    retq # sched: [1:1.00]
639;
640; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
641; SKX:       # %bb.0:
642; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
643; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
644; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
645; SKX-NEXT:    retq # sched: [7:1.00]
646  %vec = load <32 x i16>, <32 x i16>* %vp
647  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
648  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
649  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
650  ret <32 x i16> %res
651}
652
; Merge-masked permute of a <32 x i16> vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled value, all other lanes keep %vec2.
; Both run lines expect vptestnmw to build %k1 and a {%k1}-masked vpermw with
; the load folded as a (%rdi) memory operand, merging straight into zmm0.
653define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
654; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
655; GENERIC:       # %bb.0:
656; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
657; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
658; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
659; GENERIC-NEXT:    retq # sched: [1:1.00]
660;
661; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
662; SKX:       # %bb.0:
663; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
664; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
665; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
666; SKX-NEXT:    retq # sched: [7:1.00]
667  %vec = load <32 x i16>, <32 x i16>* %vp
668  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
669  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
670  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
671  ret <32 x i16> %res
672}
673
; Zero-masked permute of a <32 x i16> vector loaded from %vp: lanes whose %mask
; element is zero take the shuffled value, all other lanes are zeroed. Both run
; lines expect vptestnmw to build %k1 and vpermw with {%k1} {z}, the load being
; folded as a (%rdi) memory operand.
674define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
675; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
676; GENERIC:       # %bb.0:
677; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
678; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
679; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
680; GENERIC-NEXT:    retq # sched: [1:1.00]
681;
682; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
683; SKX:       # %bb.0:
684; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
685; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
686; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
687; SKX-NEXT:    retq # sched: [7:1.00]
688  %vec = load <32 x i16>, <32 x i16>* %vp
689  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
690  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
691  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
692  ret <32 x i16> %res
693}
694
; Merge-masked permute of a <32 x i16> vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled value, all other lanes keep %vec2.
; Both run lines expect vptestnmw to build %k1 and a {%k1}-masked vpermw with
; the load folded as a (%rdi) memory operand, merging straight into zmm0.
695define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
696; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
697; GENERIC:       # %bb.0:
698; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
699; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
700; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
701; GENERIC-NEXT:    retq # sched: [1:1.00]
702;
703; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
704; SKX:       # %bb.0:
705; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
706; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
707; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
708; SKX-NEXT:    retq # sched: [7:1.00]
709  %vec = load <32 x i16>, <32 x i16>* %vp
710  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
711  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
712  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
713  ret <32 x i16> %res
714}
715
; Zero-masked permute of a <32 x i16> vector loaded from %vp: lanes whose %mask
; element is zero take the shuffled value, all other lanes are zeroed. Both run
; lines expect vptestnmw to build %k1 and vpermw with {%k1} {z}, the load being
; folded as a (%rdi) memory operand.
716define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
717; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
718; GENERIC:       # %bb.0:
719; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
720; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
721; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
722; GENERIC-NEXT:    retq # sched: [1:1.00]
723;
724; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
725; SKX:       # %bb.0:
726; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
727; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
728; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
729; SKX-NEXT:    retq # sched: [7:1.00]
730  %vec = load <32 x i16>, <32 x i16>* %vp
731  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
732  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
733  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
734  ret <32 x i16> %res
735}
736
; Unmasked permute of a <32 x i16> vector loaded from %vp with a constant index
; vector. Both run lines expect the load to be folded into vpermw as a (%rdi)
; memory operand, with the index vector held in zmm0.
737define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
738; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
739; GENERIC:       # %bb.0:
740; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
741; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
742; GENERIC-NEXT:    retq # sched: [1:1.00]
743;
744; SKX-LABEL: test_32xi16_perm_mem_mask3:
745; SKX:       # %bb.0:
746; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
747; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
748; SKX-NEXT:    retq # sched: [7:1.00]
749  %vec = load <32 x i16>, <32 x i16>* %vp
750  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
751  ret <32 x i16> %res
752}
; Merge-masked permute of a <32 x i16> vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled value, all other lanes keep %vec2.
; Both run lines expect vptestnmw to build %k1 and a {%k1}-masked vpermw with
; the load folded as a (%rdi) memory operand, merging straight into zmm0.
753define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
754; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
755; GENERIC:       # %bb.0:
756; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
757; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
758; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
759; GENERIC-NEXT:    retq # sched: [1:1.00]
760;
761; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
762; SKX:       # %bb.0:
763; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
764; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
765; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
766; SKX-NEXT:    retq # sched: [7:1.00]
767  %vec = load <32 x i16>, <32 x i16>* %vp
768  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
769  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
770  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
771  ret <32 x i16> %res
772}
773
; Zero-masked permute of a <32 x i16> vector loaded from %vp: lanes whose %mask
; element is zero take the shuffled value, all other lanes are zeroed. Both run
; lines expect vptestnmw to build %k1 and vpermw with {%k1} {z}, the load being
; folded as a (%rdi) memory operand.
774define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
775; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
776; GENERIC:       # %bb.0:
777; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
778; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
779; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
780; GENERIC-NEXT:    retq # sched: [1:1.00]
781;
782; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
783; SKX:       # %bb.0:
784; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
785; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
786; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
787; SKX-NEXT:    retq # sched: [7:1.00]
788  %vec = load <32 x i16>, <32 x i16>* %vp
789  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
790  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
791  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
792  ret <32 x i16> %res
793}
794
; Unmasked permute of an <8 x i32> register operand with a constant index
; vector. Note that for this unmasked integer shuffle both run lines expect the
; float-domain forms: the indices are loaded with vmovaps and the permute is
; emitted as vpermps.
795define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
796; GENERIC-LABEL: test_8xi32_perm_mask0:
797; GENERIC:       # %bb.0:
798; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
799; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
800; GENERIC-NEXT:    retq # sched: [1:1.00]
801;
802; SKX-LABEL: test_8xi32_perm_mask0:
803; SKX:       # %bb.0:
804; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
805; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
806; SKX-NEXT:    retq # sched: [7:1.00]
807  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
808  ret <8 x i32> %res
809}
; Merge-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes keep %vec2. Both run lines expect vptestnmd to build %k1, then a
; {%k1}-masked vpermd into ymm1 and a copy of the result into ymm0.
810define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
811; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
812; GENERIC:       # %bb.0:
813; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
814; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
815; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
816; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
817; GENERIC-NEXT:    retq # sched: [1:1.00]
818;
819; SKX-LABEL: test_masked_8xi32_perm_mask0:
820; SKX:       # %bb.0:
821; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
822; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
823; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
824; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
825; SKX-NEXT:    retq # sched: [7:1.00]
826  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
827  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
828  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
829  ret <8 x i32> %res
830}
831
; Zero-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes are zeroed. Both run lines expect vptestnmd to build %k1 followed
; by vpermd with {%k1} {z} writing ymm0 directly.
832define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
833; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
834; GENERIC:       # %bb.0:
835; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
836; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
837; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
838; GENERIC-NEXT:    retq # sched: [1:1.00]
839;
840; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
841; SKX:       # %bb.0:
842; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
843; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
844; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
845; SKX-NEXT:    retq # sched: [7:1.00]
846  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
847  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
848  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
849  ret <8 x i32> %res
850}
; Merge-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes keep %vec2. Both run lines expect vptestnmd to build %k1, then a
; {%k1}-masked vpermd into ymm1 and a copy of the result into ymm0.
851define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
852; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
853; GENERIC:       # %bb.0:
854; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
855; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
856; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
857; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
858; GENERIC-NEXT:    retq # sched: [1:1.00]
859;
860; SKX-LABEL: test_masked_8xi32_perm_mask1:
861; SKX:       # %bb.0:
862; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
863; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
864; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
865; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
866; SKX-NEXT:    retq # sched: [7:1.00]
867  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
868  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
869  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
870  ret <8 x i32> %res
871}
872
; Zero-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes are zeroed. Both run lines expect vptestnmd to build %k1 followed
; by vpermd with {%k1} {z} writing ymm0 directly.
873define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
874; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
875; GENERIC:       # %bb.0:
876; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
877; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
878; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
879; GENERIC-NEXT:    retq # sched: [1:1.00]
880;
881; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
882; SKX:       # %bb.0:
883; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
884; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
885; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
886; SKX-NEXT:    retq # sched: [7:1.00]
887  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
888  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
889  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
890  ret <8 x i32> %res
891}
; Merge-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes keep %vec2. Both run lines expect vptestnmd to build %k1, then a
; {%k1}-masked vpermd into ymm1 and a copy of the result into ymm0.
892define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
893; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
894; GENERIC:       # %bb.0:
895; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
896; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
897; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
898; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
899; GENERIC-NEXT:    retq # sched: [1:1.00]
900;
901; SKX-LABEL: test_masked_8xi32_perm_mask2:
902; SKX:       # %bb.0:
903; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
904; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
905; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
906; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
907; SKX-NEXT:    retq # sched: [7:1.00]
908  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
909  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
910  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
911  ret <8 x i32> %res
912}
913
; Zero-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes are zeroed. Both run lines expect vptestnmd to build %k1 followed
; by vpermd with {%k1} {z} writing ymm0 directly.
914define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
915; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
916; GENERIC:       # %bb.0:
917; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
918; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
919; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
920; GENERIC-NEXT:    retq # sched: [1:1.00]
921;
922; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
923; SKX:       # %bb.0:
924; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
925; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
926; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
927; SKX-NEXT:    retq # sched: [7:1.00]
928  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
929  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
930  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
931  ret <8 x i32> %res
932}
; Unmasked permute of an <8 x i32> register operand with a constant index
; vector. Note that for this unmasked integer shuffle both run lines expect the
; float-domain forms: the indices are loaded with vmovaps and the permute is
; emitted as vpermps.
933define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
934; GENERIC-LABEL: test_8xi32_perm_mask3:
935; GENERIC:       # %bb.0:
936; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
937; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
938; GENERIC-NEXT:    retq # sched: [1:1.00]
939;
940; SKX-LABEL: test_8xi32_perm_mask3:
941; SKX:       # %bb.0:
942; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
943; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
944; SKX-NEXT:    retq # sched: [7:1.00]
945  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
946  ret <8 x i32> %res
947}
; Merge-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes keep %vec2. Both run lines expect vptestnmd to build %k1, then a
; {%k1}-masked vpermd into ymm1 and a copy of the result into ymm0.
948define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
949; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
950; GENERIC:       # %bb.0:
951; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
952; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
953; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
954; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
955; GENERIC-NEXT:    retq # sched: [1:1.00]
956;
957; SKX-LABEL: test_masked_8xi32_perm_mask3:
958; SKX:       # %bb.0:
959; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
960; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
961; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
962; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
963; SKX-NEXT:    retq # sched: [7:1.00]
964  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
965  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
966  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
967  ret <8 x i32> %res
968}
969
; Zero-masked permute of an <8 x i32> register operand with a constant index
; vector: lanes whose %mask element is zero take the shuffled %vec value, all
; other lanes are zeroed. Both run lines expect vptestnmd to build %k1 followed
; by vpermd with {%k1} {z} writing ymm0 directly.
970define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
971; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
972; GENERIC:       # %bb.0:
973; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
974; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
975; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
976; GENERIC-NEXT:    retq # sched: [1:1.00]
977;
978; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
979; SKX:       # %bb.0:
980; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
981; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
982; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
983; SKX-NEXT:    retq # sched: [7:1.00]
984  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
985  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
986  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
987  ret <8 x i32> %res
988}
; Unmasked permute of an <8 x i32> vector loaded from %vp with a constant index
; vector. Both run lines expect the float-domain vpermps with the load folded
; as a (%rdi) memory operand and the indices loaded into ymm0 with vmovaps.
989define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
990; GENERIC-LABEL: test_8xi32_perm_mem_mask0:
991; GENERIC:       # %bb.0:
992; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
993; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
994; GENERIC-NEXT:    retq # sched: [1:1.00]
995;
996; SKX-LABEL: test_8xi32_perm_mem_mask0:
997; SKX:       # %bb.0:
998; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
999; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
1000; SKX-NEXT:    retq # sched: [7:1.00]
1001  %vec = load <8 x i32>, <8 x i32>* %vp
1002  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
1003  ret <8 x i32> %res
1004}
; Merge-masked permute of an <8 x i32> vector loaded from %vp: lanes whose
; %mask element is zero take the shuffled value, all other lanes keep %vec2.
; Both run lines expect vptestnmd to build %k1 and a {%k1}-masked vpermd with
; the load folded as a (%rdi) memory operand, merging straight into ymm0.
1005define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1006; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
1007; GENERIC:       # %bb.0:
1008; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
1009; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
1010; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
1011; GENERIC-NEXT:    retq # sched: [1:1.00]
1012;
1013; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
1014; SKX:       # %bb.0:
1015; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
1016; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
1017; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
1018; SKX-NEXT:    retq # sched: [7:1.00]
1019  %vec = load <8 x i32>, <8 x i32>* %vp
1020  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
1021  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1022  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1023  ret <8 x i32> %res
1024}
1025
; Zero-masked permute of an <8 x i32> vector loaded from %vp: lanes whose %mask
; element is zero take the shuffled value, all other lanes are zeroed. Both run
; lines expect vptestnmd to build %k1 and vpermd with {%k1} {z}, the load being
; folded as a (%rdi) memory operand.
1026define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
1027; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
1028; GENERIC:       # %bb.0:
1029; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
1030; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
1031; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
1032; GENERIC-NEXT:    retq # sched: [1:1.00]
1033;
1034; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
1035; SKX:       # %bb.0:
1036; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
1037; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
1038; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
1039; SKX-NEXT:    retq # sched: [7:1.00]
1040  %vec = load <8 x i32>, <8 x i32>* %vp
1041  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
1042  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1043  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1044  ret <8 x i32> %res
1045}
1046
; Merge-masked permute of an <8 x i32> loaded from %vp (indices 4,6,1,7,6,7,6,5).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd that reads its data operand from (%rdi).
1047define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1048; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
1049; GENERIC:       # %bb.0:
1050; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
1051; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
1052; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
1053; GENERIC-NEXT:    retq # sched: [1:1.00]
1054;
1055; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
1056; SKX:       # %bb.0:
1057; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
1058; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
1059; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
1060; SKX-NEXT:    retq # sched: [7:1.00]
1061  %vec = load <8 x i32>, <8 x i32>* %vp
1062  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
1063  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1064  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1065  ret <8 x i32> %res
1066}
1067
; Zero-masked permute of an <8 x i32> loaded from %vp (indices 4,6,1,7,6,7,6,5).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) vpermd that reads its data operand from (%rdi).
1068define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
1069; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
1070; GENERIC:       # %bb.0:
1071; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
1072; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
1073; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
1074; GENERIC-NEXT:    retq # sched: [1:1.00]
1075;
1076; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
1077; SKX:       # %bb.0:
1078; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
1079; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
1080; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
1081; SKX-NEXT:    retq # sched: [7:1.00]
1082  %vec = load <8 x i32>, <8 x i32>* %vp
1083  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
1084  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1085  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1086  ret <8 x i32> %res
1087}
1088
; Merge-masked permute of an <8 x i32> loaded from %vp (indices 6,4,6,1,6,3,6,3).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd that reads its data operand from (%rdi).
1089define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1090; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
1091; GENERIC:       # %bb.0:
1092; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
1093; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
1094; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
1095; GENERIC-NEXT:    retq # sched: [1:1.00]
1096;
1097; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
1098; SKX:       # %bb.0:
1099; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
1100; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
1101; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
1102; SKX-NEXT:    retq # sched: [7:1.00]
1103  %vec = load <8 x i32>, <8 x i32>* %vp
1104  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
1105  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1106  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1107  ret <8 x i32> %res
1108}
1109
; Zero-masked permute of an <8 x i32> loaded from %vp (indices 6,4,6,1,6,3,6,3).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) vpermd that reads its data operand from (%rdi).
1110define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
1111; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
1112; GENERIC:       # %bb.0:
1113; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
1114; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
1115; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
1116; GENERIC-NEXT:    retq # sched: [1:1.00]
1117;
1118; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
1119; SKX:       # %bb.0:
1120; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
1121; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
1122; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
1123; SKX-NEXT:    retq # sched: [7:1.00]
1124  %vec = load <8 x i32>, <8 x i32>* %vp
1125  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
1126  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1127  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1128  ret <8 x i32> %res
1129}
1130
; Unmasked permute of an <8 x i32> loaded from %vp (indices 6,0,0,7,3,7,7,5).
; The checks expect the index vector to be materialised with vmovaps and the
; shuffle to be emitted as vpermps with a (%rdi) memory operand.
1131define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
1132; GENERIC-LABEL: test_8xi32_perm_mem_mask3:
1133; GENERIC:       # %bb.0:
1134; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
1135; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
1136; GENERIC-NEXT:    retq # sched: [1:1.00]
1137;
1138; SKX-LABEL: test_8xi32_perm_mem_mask3:
1139; SKX:       # %bb.0:
1140; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
1141; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
1142; SKX-NEXT:    retq # sched: [7:1.00]
1143  %vec = load <8 x i32>, <8 x i32>* %vp
1144  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
1145  ret <8 x i32> %res
1146}
; Merge-masked permute of an <8 x i32> loaded from %vp (indices 6,0,0,7,3,7,7,5).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd that reads its data operand from (%rdi).
1147define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
1148; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
1149; GENERIC:       # %bb.0:
1150; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
1151; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
1152; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
1153; GENERIC-NEXT:    retq # sched: [1:1.00]
1154;
1155; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
1156; SKX:       # %bb.0:
1157; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
1158; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
1159; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
1160; SKX-NEXT:    retq # sched: [7:1.00]
1161  %vec = load <8 x i32>, <8 x i32>* %vp
1162  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
1163  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1164  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
1165  ret <8 x i32> %res
1166}
1167
; Zero-masked permute of an <8 x i32> loaded from %vp (indices 6,0,0,7,3,7,7,5).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) vpermd that reads its data operand from (%rdi).
1168define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
1169; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
1170; GENERIC:       # %bb.0:
1171; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
1172; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
1173; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
1174; GENERIC-NEXT:    retq # sched: [1:1.00]
1175;
1176; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
1177; SKX:       # %bb.0:
1178; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
1179; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
1180; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
1181; SKX-NEXT:    retq # sched: [7:1.00]
1182  %vec = load <8 x i32>, <8 x i32>* %vp
1183  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
1184  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
1185  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
1186  ret <8 x i32> %res
1187}
1188
; Unmasked register permute of a <16 x i32> (indices 14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7).
; The checks expect the index vector in zmm1 followed by a register-form vpermps.
; NOTE(review): %mask is declared but never referenced in this body, whereas
; test_16xi32_perm_mask3 takes only %vec - confirm whether the extra parameter is intentional.
1189define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
1190; GENERIC-LABEL: test_16xi32_perm_mask0:
1191; GENERIC:       # %bb.0:
1192; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
1193; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
1194; GENERIC-NEXT:    retq # sched: [1:1.00]
1195;
1196; SKX-LABEL: test_16xi32_perm_mask0:
1197; SKX:       # %bb.0:
1198; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
1199; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
1200; SKX-NEXT:    retq # sched: [7:1.00]
1201  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
1202  ret <16 x i32> %res
1203}
; Merge-masked register permute of a <16 x i32> (indices 14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd writing zmm1, then a vmovdqa64 copy into zmm0.
1204define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
1205; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
1206; GENERIC:       # %bb.0:
1207; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
1208; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
1209; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
1210; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
1211; GENERIC-NEXT:    retq # sched: [1:1.00]
1212;
1213; SKX-LABEL: test_masked_16xi32_perm_mask0:
1214; SKX:       # %bb.0:
1215; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
1216; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
1217; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
1218; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
1219; SKX-NEXT:    retq # sched: [7:1.00]
1220  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
1221  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1222  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1223  ret <16 x i32> %res
1224}
1225
; Zero-masked register permute of a <16 x i32> (indices 14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) register-form vpermd writing zmm0 directly.
1226define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
1227; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
1228; GENERIC:       # %bb.0:
1229; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
1230; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1231; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
1232; GENERIC-NEXT:    retq # sched: [1:1.00]
1233;
1234; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
1235; SKX:       # %bb.0:
1236; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
1237; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1238; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
1239; SKX-NEXT:    retq # sched: [7:1.00]
1240  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
1241  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1242  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1243  ret <16 x i32> %res
1244}
; Merge-masked register permute of a <16 x i32> (indices 10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd writing zmm1, then a vmovdqa64 copy into zmm0.
1245define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
1246; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
1247; GENERIC:       # %bb.0:
1248; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
1249; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
1250; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
1251; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
1252; GENERIC-NEXT:    retq # sched: [1:1.00]
1253;
1254; SKX-LABEL: test_masked_16xi32_perm_mask1:
1255; SKX:       # %bb.0:
1256; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
1257; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
1258; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
1259; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
1260; SKX-NEXT:    retq # sched: [7:1.00]
1261  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
1262  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1263  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1264  ret <16 x i32> %res
1265}
1266
; Zero-masked register permute of a <16 x i32> (indices 10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) register-form vpermd writing zmm0 directly.
1267define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
1268; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
1269; GENERIC:       # %bb.0:
1270; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
1271; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1272; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
1273; GENERIC-NEXT:    retq # sched: [1:1.00]
1274;
1275; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
1276; SKX:       # %bb.0:
1277; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
1278; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1279; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
1280; SKX-NEXT:    retq # sched: [7:1.00]
1281  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
1282  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1283  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1284  ret <16 x i32> %res
1285}
; Merge-masked register permute of a <16 x i32> (indices 3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd writing zmm1, then a vmovdqa64 copy into zmm0.
1286define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
1287; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
1288; GENERIC:       # %bb.0:
1289; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
1290; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
1291; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
1292; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
1293; GENERIC-NEXT:    retq # sched: [1:1.00]
1294;
1295; SKX-LABEL: test_masked_16xi32_perm_mask2:
1296; SKX:       # %bb.0:
1297; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
1298; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
1299; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
1300; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
1301; SKX-NEXT:    retq # sched: [7:1.00]
1302  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
1303  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1304  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1305  ret <16 x i32> %res
1306}
1307
; Zero-masked register permute of a <16 x i32> (indices 3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) register-form vpermd writing zmm0 directly.
1308define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
1309; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
1310; GENERIC:       # %bb.0:
1311; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
1312; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1313; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
1314; GENERIC-NEXT:    retq # sched: [1:1.00]
1315;
1316; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
1317; SKX:       # %bb.0:
1318; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
1319; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1320; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
1321; SKX-NEXT:    retq # sched: [7:1.00]
1322  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
1323  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1324  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1325  ret <16 x i32> %res
1326}
; Unmasked register permute of a <16 x i32> (indices 7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12).
; The checks expect the index vector in zmm1 followed by a register-form vpermps.
1327define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
1328; GENERIC-LABEL: test_16xi32_perm_mask3:
1329; GENERIC:       # %bb.0:
1330; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
1331; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
1332; GENERIC-NEXT:    retq # sched: [1:1.00]
1333;
1334; SKX-LABEL: test_16xi32_perm_mask3:
1335; SKX:       # %bb.0:
1336; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
1337; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
1338; SKX-NEXT:    retq # sched: [7:1.00]
1339  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
1340  ret <16 x i32> %res
1341}
; Merge-masked register permute of a <16 x i32> (indices 7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd writing zmm1, then a vmovdqa64 copy into zmm0.
1342define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
1343; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
1344; GENERIC:       # %bb.0:
1345; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
1346; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
1347; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
1348; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
1349; GENERIC-NEXT:    retq # sched: [1:1.00]
1350;
1351; SKX-LABEL: test_masked_16xi32_perm_mask3:
1352; SKX:       # %bb.0:
1353; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
1354; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
1355; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
1356; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
1357; SKX-NEXT:    retq # sched: [7:1.00]
1358  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
1359  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1360  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1361  ret <16 x i32> %res
1362}
1363
; Zero-masked register permute of a <16 x i32> (indices 7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) register-form vpermd writing zmm0 directly.
1364define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
1365; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
1366; GENERIC:       # %bb.0:
1367; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
1368; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1369; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
1370; GENERIC-NEXT:    retq # sched: [1:1.00]
1371;
1372; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
1373; SKX:       # %bb.0:
1374; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
1375; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1376; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
1377; SKX-NEXT:    retq # sched: [7:1.00]
1378  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
1379  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1380  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1381  ret <16 x i32> %res
1382}
; Unmasked permute of a <16 x i32> loaded from %vp (indices 0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6).
; The checks expect the index vector in zmm0 and a vpermps with a (%rdi) memory operand.
1383define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
1384; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
1385; GENERIC:       # %bb.0:
1386; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
1387; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
1388; GENERIC-NEXT:    retq # sched: [1:1.00]
1389;
1390; SKX-LABEL: test_16xi32_perm_mem_mask0:
1391; SKX:       # %bb.0:
1392; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
1393; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
1394; SKX-NEXT:    retq # sched: [7:1.00]
1395  %vec = load <16 x i32>, <16 x i32>* %vp
1396  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
1397  ret <16 x i32> %res
1398}
; Merge-masked permute of a <16 x i32> loaded from %vp (indices 0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd that reads its data operand from (%rdi).
1399define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
1400; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
1401; GENERIC:       # %bb.0:
1402; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
1403; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1404; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
1405; GENERIC-NEXT:    retq # sched: [1:1.00]
1406;
1407; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
1408; SKX:       # %bb.0:
1409; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
1410; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1411; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
1412; SKX-NEXT:    retq # sched: [7:1.00]
1413  %vec = load <16 x i32>, <16 x i32>* %vp
1414  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
1415  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1416  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1417  ret <16 x i32> %res
1418}
1419
; Zero-masked permute of a <16 x i32> loaded from %vp (indices 0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) vpermd that reads its data operand from (%rdi).
1420define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
1421; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
1422; GENERIC:       # %bb.0:
1423; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
1424; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
1425; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
1426; GENERIC-NEXT:    retq # sched: [1:1.00]
1427;
1428; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
1429; SKX:       # %bb.0:
1430; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
1431; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
1432; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
1433; SKX-NEXT:    retq # sched: [7:1.00]
1434  %vec = load <16 x i32>, <16 x i32>* %vp
1435  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
1436  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1437  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1438  ret <16 x i32> %res
1439}
1440
; Merge-masked permute of a <16 x i32> loaded from %vp (indices 11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd that reads its data operand from (%rdi).
1441define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
1442; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
1443; GENERIC:       # %bb.0:
1444; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
1445; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1446; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
1447; GENERIC-NEXT:    retq # sched: [1:1.00]
1448;
1449; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
1450; SKX:       # %bb.0:
1451; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
1452; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1453; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
1454; SKX-NEXT:    retq # sched: [7:1.00]
1455  %vec = load <16 x i32>, <16 x i32>* %vp
1456  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
1457  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1458  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1459  ret <16 x i32> %res
1460}
1461
; Zero-masked permute of a <16 x i32> loaded from %vp (indices 11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) vpermd that reads its data operand from (%rdi).
1462define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
1463; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
1464; GENERIC:       # %bb.0:
1465; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
1466; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
1467; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
1468; GENERIC-NEXT:    retq # sched: [1:1.00]
1469;
1470; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
1471; SKX:       # %bb.0:
1472; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
1473; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
1474; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
1475; SKX-NEXT:    retq # sched: [7:1.00]
1476  %vec = load <16 x i32>, <16 x i32>* %vp
1477  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
1478  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1479  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1480  ret <16 x i32> %res
1481}
1482
; Merge-masked permute of a <16 x i32> loaded from %vp (indices 7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd that reads its data operand from (%rdi).
1483define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
1484; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
1485; GENERIC:       # %bb.0:
1486; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
1487; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1488; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
1489; GENERIC-NEXT:    retq # sched: [1:1.00]
1490;
1491; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
1492; SKX:       # %bb.0:
1493; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
1494; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1495; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
1496; SKX-NEXT:    retq # sched: [7:1.00]
1497  %vec = load <16 x i32>, <16 x i32>* %vp
1498  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
1499  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1500  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1501  ret <16 x i32> %res
1502}
1503
; Zero-masked permute of a <16 x i32> loaded from %vp (indices 7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) vpermd that reads its data operand from (%rdi).
1504define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
1505; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
1506; GENERIC:       # %bb.0:
1507; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
1508; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
1509; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
1510; GENERIC-NEXT:    retq # sched: [1:1.00]
1511;
1512; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
1513; SKX:       # %bb.0:
1514; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
1515; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
1516; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
1517; SKX-NEXT:    retq # sched: [7:1.00]
1518  %vec = load <16 x i32>, <16 x i32>* %vp
1519  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
1520  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1521  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1522  ret <16 x i32> %res
1523}
1524
; Unmasked permute of a <16 x i32> loaded from %vp (indices 11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1).
; The checks expect the index vector in zmm0 and a vpermps with a (%rdi) memory operand.
1525define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
1526; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
1527; GENERIC:       # %bb.0:
1528; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
1529; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
1530; GENERIC-NEXT:    retq # sched: [1:1.00]
1531;
1532; SKX-LABEL: test_16xi32_perm_mem_mask3:
1533; SKX:       # %bb.0:
1534; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
1535; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
1536; SKX-NEXT:    retq # sched: [7:1.00]
1537  %vec = load <16 x i32>, <16 x i32>* %vp
1538  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
1539  ret <16 x i32> %res
1540}
; Merge-masked permute of a <16 x i32> loaded from %vp (indices 11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1).
; Lanes whose %mask element equals zero take the shuffled value; the others keep %vec2.
; The checks expect a k1-masked vpermd that reads its data operand from (%rdi).
1541define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
1542; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
1543; GENERIC:       # %bb.0:
1544; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
1545; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
1546; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
1547; GENERIC-NEXT:    retq # sched: [1:1.00]
1548;
1549; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
1550; SKX:       # %bb.0:
1551; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
1552; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
1553; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
1554; SKX-NEXT:    retq # sched: [7:1.00]
1555  %vec = load <16 x i32>, <16 x i32>* %vp
1556  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
1557  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1558  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
1559  ret <16 x i32> %res
1560}
1561
; Zero-masked permute of a <16 x i32> loaded from %vp (indices 11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1).
; Lanes whose %mask element equals zero take the shuffled value; the others become zero.
; The checks expect a k1 zero-masked ({z}) vpermd that reads its data operand from (%rdi).
1562define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
1563; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
1564; GENERIC:       # %bb.0:
1565; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
1566; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
1567; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
1568; GENERIC-NEXT:    retq # sched: [1:1.00]
1569;
1570; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
1571; SKX:       # %bb.0:
1572; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
1573; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
1574; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
1575; SKX-NEXT:    retq # sched: [7:1.00]
1576  %vec = load <16 x i32>, <16 x i32>* %vp
1577  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
1578  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
1579  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
1580  ret <16 x i32> %res
1581}
1582
; Unmasked single-source permute of a <4 x i64> argument with indices 2,0,3,1.
; The checks expect one vpermpd on ymm0 (note: vpermpd, not vpermq, even though
; the IR element type is i64) followed by the return.
1583define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
1584; GENERIC-LABEL: test_4xi64_perm_mask0:
1585; GENERIC:       # %bb.0:
1586; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [1:1.00]
1587; GENERIC-NEXT:    retq # sched: [1:1.00]
1588;
1589; SKX-LABEL: test_4xi64_perm_mask0:
1590; SKX:       # %bb.0:
1591; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
1592; SKX-NEXT:    retq # sched: [7:1.00]
1593  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
1594  ret <4 x i64> %res
1595}
; Merge-masked <4 x i64> permute with indices 2,0,3,1.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1, vpermq writing ymm1 under {%k1},
; and a vmovdqa copying ymm1 into ymm0.
1596define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1597; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
1598; GENERIC:       # %bb.0:
1599; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
1600; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00]
1601; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
1602; GENERIC-NEXT:    retq # sched: [1:1.00]
1603;
1604; SKX-LABEL: test_masked_4xi64_perm_mask0:
1605; SKX:       # %bb.0:
1606; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
1607; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
1608; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
1609; SKX-NEXT:    retq # sched: [7:1.00]
1610  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
1611  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1612  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1613  ret <4 x i64> %res
1614}
1615
; Zero-masked <4 x i64> permute with indices 2,0,3,1.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and vpermq on ymm0 under {%k1} {z}.
1616define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
1617; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
1618; GENERIC:       # %bb.0:
1619; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1620; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00]
1621; GENERIC-NEXT:    retq # sched: [1:1.00]
1622;
1623; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
1624; SKX:       # %bb.0:
1625; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1626; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
1627; SKX-NEXT:    retq # sched: [7:1.00]
1628  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
1629  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1630  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1631  ret <4 x i64> %res
1632}
; Merge-masked <4 x i64> permute with indices 1,2,0,3.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1, vpermq writing ymm1 under {%k1},
; and a vmovdqa copying ymm1 into ymm0.
1633define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1634; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
1635; GENERIC:       # %bb.0:
1636; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
1637; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00]
1638; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
1639; GENERIC-NEXT:    retq # sched: [1:1.00]
1640;
1641; SKX-LABEL: test_masked_4xi64_perm_mask1:
1642; SKX:       # %bb.0:
1643; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
1644; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
1645; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
1646; SKX-NEXT:    retq # sched: [7:1.00]
1647  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
1648  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1649  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1650  ret <4 x i64> %res
1651}
1652
; Zero-masked <4 x i64> permute with indices 1,2,0,3.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and vpermq on ymm0 under {%k1} {z}.
1653define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
1654; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
1655; GENERIC:       # %bb.0:
1656; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1657; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00]
1658; GENERIC-NEXT:    retq # sched: [1:1.00]
1659;
1660; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
1661; SKX:       # %bb.0:
1662; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1663; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
1664; SKX-NEXT:    retq # sched: [7:1.00]
1665  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
1666  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1667  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1668  ret <4 x i64> %res
1669}
; Merge-masked <4 x i64> permute with indices 2,2,2,1.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1, vpermq writing ymm1 under {%k1},
; and a vmovdqa copying ymm1 into ymm0.
1670define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1671; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
1672; GENERIC:       # %bb.0:
1673; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
1674; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00]
1675; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
1676; GENERIC-NEXT:    retq # sched: [1:1.00]
1677;
1678; SKX-LABEL: test_masked_4xi64_perm_mask2:
1679; SKX:       # %bb.0:
1680; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
1681; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
1682; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
1683; SKX-NEXT:    retq # sched: [7:1.00]
1684  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
1685  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1686  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1687  ret <4 x i64> %res
1688}
1689
; Zero-masked <4 x i64> permute with indices 2,2,2,1.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and vpermq on ymm0 under {%k1} {z}.
1690define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
1691; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
1692; GENERIC:       # %bb.0:
1693; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1694; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00]
1695; GENERIC-NEXT:    retq # sched: [1:1.00]
1696;
1697; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
1698; SKX:       # %bb.0:
1699; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1700; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
1701; SKX-NEXT:    retq # sched: [7:1.00]
1702  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
1703  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1704  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1705  ret <4 x i64> %res
1706}
; Unmasked single-source permute of a <4 x i64> argument with indices 2,1,3,3.
; The checks expect one vpermpd on ymm0 (vpermpd rather than vpermq, although
; the IR element type is i64) followed by the return.
1707define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
1708; GENERIC-LABEL: test_4xi64_perm_mask3:
1709; GENERIC:       # %bb.0:
1710; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [1:1.00]
1711; GENERIC-NEXT:    retq # sched: [1:1.00]
1712;
1713; SKX-LABEL: test_4xi64_perm_mask3:
1714; SKX:       # %bb.0:
1715; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
1716; SKX-NEXT:    retq # sched: [7:1.00]
1717  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
1718  ret <4 x i64> %res
1719}
; Merge-masked <4 x i64> permute with indices 2,1,3,3.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1, vpermq writing ymm1 under {%k1},
; and a vmovdqa copying ymm1 into ymm0.
1720define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
1721; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
1722; GENERIC:       # %bb.0:
1723; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
1724; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00]
1725; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
1726; GENERIC-NEXT:    retq # sched: [1:1.00]
1727;
1728; SKX-LABEL: test_masked_4xi64_perm_mask3:
1729; SKX:       # %bb.0:
1730; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
1731; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
1732; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
1733; SKX-NEXT:    retq # sched: [7:1.00]
1734  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
1735  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1736  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1737  ret <4 x i64> %res
1738}
1739
; Zero-masked <4 x i64> permute with indices 2,1,3,3.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and vpermq on ymm0 under {%k1} {z}.
1740define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
1741; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
1742; GENERIC:       # %bb.0:
1743; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1744; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00]
1745; GENERIC-NEXT:    retq # sched: [1:1.00]
1746;
1747; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
1748; SKX:       # %bb.0:
1749; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1750; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
1751; SKX-NEXT:    retq # sched: [7:1.00]
1752  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
1753  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1754  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1755  ret <4 x i64> %res
1756}
; Unmasked permute (indices 2,1,2,0) of a <4 x i64> vector loaded from %vp.
; The checks expect the load to appear as the mem[...] operand of a single
; vpermpd, with no separate load instruction.
1757define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
1758; GENERIC-LABEL: test_4xi64_perm_mem_mask0:
1759; GENERIC:       # %bb.0:
1760; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [8:1.00]
1761; GENERIC-NEXT:    retq # sched: [1:1.00]
1762;
1763; SKX-LABEL: test_4xi64_perm_mem_mask0:
1764; SKX:       # %bb.0:
1765; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00]
1766; SKX-NEXT:    retq # sched: [7:1.00]
1767  %vec = load <4 x i64>, <4 x i64>* %vp
1768  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
1769  ret <4 x i64> %res
1770}
; Merge-masked permute (indices 2,1,2,0) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1}; no trailing register copy is expected.
1771define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1772; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
1773; GENERIC:       # %bb.0:
1774; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1775; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [8:1.00]
1776; GENERIC-NEXT:    retq # sched: [1:1.00]
1777;
1778; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
1779; SKX:       # %bb.0:
1780; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1781; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
1782; SKX-NEXT:    retq # sched: [7:1.00]
1783  %vec = load <4 x i64>, <4 x i64>* %vp
1784  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
1785  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1786  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1787  ret <4 x i64> %res
1788}
1789
; Zero-masked permute (indices 2,1,2,0) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1} {z}.
1790define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
1791; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
1792; GENERIC:       # %bb.0:
1793; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
1794; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [8:1.00]
1795; GENERIC-NEXT:    retq # sched: [1:1.00]
1796;
1797; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
1798; SKX:       # %bb.0:
1799; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
1800; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
1801; SKX-NEXT:    retq # sched: [7:1.00]
1802  %vec = load <4 x i64>, <4 x i64>* %vp
1803  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
1804  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1805  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1806  ret <4 x i64> %res
1807}
1808
; Merge-masked permute (indices 2,1,1,1) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1}.
1809define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1810; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
1811; GENERIC:       # %bb.0:
1812; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1813; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [8:1.00]
1814; GENERIC-NEXT:    retq # sched: [1:1.00]
1815;
1816; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
1817; SKX:       # %bb.0:
1818; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1819; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
1820; SKX-NEXT:    retq # sched: [7:1.00]
1821  %vec = load <4 x i64>, <4 x i64>* %vp
1822  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
1823  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1824  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1825  ret <4 x i64> %res
1826}
1827
; Zero-masked permute (indices 2,1,1,1) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1} {z}.
1828define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
1829; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
1830; GENERIC:       # %bb.0:
1831; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
1832; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [8:1.00]
1833; GENERIC-NEXT:    retq # sched: [1:1.00]
1834;
1835; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
1836; SKX:       # %bb.0:
1837; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
1838; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
1839; SKX-NEXT:    retq # sched: [7:1.00]
1840  %vec = load <4 x i64>, <4 x i64>* %vp
1841  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
1842  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1843  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1844  ret <4 x i64> %res
1845}
1846
; Merge-masked permute (indices 0,1,2,0) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1}.
1847define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1848; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
1849; GENERIC:       # %bb.0:
1850; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1851; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [8:1.00]
1852; GENERIC-NEXT:    retq # sched: [1:1.00]
1853;
1854; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
1855; SKX:       # %bb.0:
1856; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1857; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
1858; SKX-NEXT:    retq # sched: [7:1.00]
1859  %vec = load <4 x i64>, <4 x i64>* %vp
1860  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
1861  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1862  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1863  ret <4 x i64> %res
1864}
1865
; Zero-masked permute (indices 0,1,2,0) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1} {z}.
1866define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
1867; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
1868; GENERIC:       # %bb.0:
1869; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
1870; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [8:1.00]
1871; GENERIC-NEXT:    retq # sched: [1:1.00]
1872;
1873; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
1874; SKX:       # %bb.0:
1875; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
1876; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
1877; SKX-NEXT:    retq # sched: [7:1.00]
1878  %vec = load <4 x i64>, <4 x i64>* %vp
1879  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
1880  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1881  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1882  ret <4 x i64> %res
1883}
1884
; Unmasked permute (indices 2,0,1,3) of a <4 x i64> vector loaded from %vp.
; The checks expect the load to appear as the mem[...] operand of a single
; vpermpd, with no separate load instruction.
1885define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
1886; GENERIC-LABEL: test_4xi64_perm_mem_mask3:
1887; GENERIC:       # %bb.0:
1888; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [8:1.00]
1889; GENERIC-NEXT:    retq # sched: [1:1.00]
1890;
1891; SKX-LABEL: test_4xi64_perm_mem_mask3:
1892; SKX:       # %bb.0:
1893; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00]
1894; SKX-NEXT:    retq # sched: [7:1.00]
1895  %vec = load <4 x i64>, <4 x i64>* %vp
1896  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
1897  ret <4 x i64> %res
1898}
; Merge-masked permute (indices 2,0,1,3) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1}.
1899define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
1900; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
1901; GENERIC:       # %bb.0:
1902; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
1903; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [8:1.00]
1904; GENERIC-NEXT:    retq # sched: [1:1.00]
1905;
1906; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
1907; SKX:       # %bb.0:
1908; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
1909; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
1910; SKX-NEXT:    retq # sched: [7:1.00]
1911  %vec = load <4 x i64>, <4 x i64>* %vp
1912  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
1913  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1914  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
1915  ret <4 x i64> %res
1916}
1917
; Zero-masked permute (indices 2,0,1,3) of a <4 x i64> vector loaded from %vp.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and a vpermq with a mem[...] source
; writing ymm0 under {%k1} {z}.
1918define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
1919; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
1920; GENERIC:       # %bb.0:
1921; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
1922; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [8:1.00]
1923; GENERIC-NEXT:    retq # sched: [1:1.00]
1924;
1925; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
1926; SKX:       # %bb.0:
1927; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
1928; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
1929; SKX-NEXT:    retq # sched: [7:1.00]
1930  %vec = load <4 x i64>, <4 x i64>* %vp
1931  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
1932  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
1933  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
1934  ret <4 x i64> %res
1935}
1936
; Unmasked single-source permute of a <8 x i64> argument with indices
; 0,4,7,6,5,5,1,6.
; The checks expect the index vector to be materialised in zmm1 by vmovaps and
; then applied with a register-indexed vpermpd (vpermpd rather than vpermq,
; although the IR element type is i64).
1937define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
1938; GENERIC-LABEL: test_8xi64_perm_mask0:
1939; GENERIC:       # %bb.0:
1940; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
1941; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
1942; GENERIC-NEXT:    retq # sched: [1:1.00]
1943;
1944; SKX-LABEL: test_8xi64_perm_mask0:
1945; SKX:       # %bb.0:
1946; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
1947; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
1948; SKX-NEXT:    retq # sched: [7:1.00]
1949  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
1950  ret <8 x i64> %res
1951}
; Merge-masked <8 x i64> permute with indices 0,4,7,6,5,5,1,6.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect the index vector in zmm3, vptestnmq producing k1, a
; register-indexed vpermq writing zmm1 under {%k1}, and a vmovdqa64 copying
; zmm1 into zmm0.
1952define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1953; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
1954; GENERIC:       # %bb.0:
1955; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
1956; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
1957; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
1958; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
1959; GENERIC-NEXT:    retq # sched: [1:1.00]
1960;
1961; SKX-LABEL: test_masked_8xi64_perm_mask0:
1962; SKX:       # %bb.0:
1963; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
1964; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
1965; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
1966; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
1967; SKX-NEXT:    retq # sched: [7:1.00]
1968  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
1969  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1970  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
1971  ret <8 x i64> %res
1972}
1973
; Zero-masked <8 x i64> permute with indices 0,4,7,6,5,5,1,6.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect the index vector in zmm2, vptestnmq producing k1, and a
; register-indexed vpermq writing zmm0 under {%k1} {z}.
1974define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
1975; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
1976; GENERIC:       # %bb.0:
1977; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
1978; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
1979; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
1980; GENERIC-NEXT:    retq # sched: [1:1.00]
1981;
1982; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
1983; SKX:       # %bb.0:
1984; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
1985; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
1986; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
1987; SKX-NEXT:    retq # sched: [7:1.00]
1988  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
1989  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
1990  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
1991  ret <8 x i64> %res
1992}
; Merge-masked <8 x i64> permute with indices 1,0,1,1,5,4,5,5 (the upper four
; indices are the lower four plus 4).
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1, a vpermq writing zmm1 under {%k1}
; with no separate index-vector load, and a vmovdqa64 copying zmm1 into zmm0.
1993define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
1994; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
1995; GENERIC:       # %bb.0:
1996; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
1997; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
1998; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
1999; GENERIC-NEXT:    retq # sched: [1:1.00]
2000;
2001; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
2002; SKX:       # %bb.0:
2003; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
2004; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
2005; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
2006; SKX-NEXT:    retq # sched: [7:1.00]
2007  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
2008  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2009  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2010  ret <8 x i64> %res
2011}
2012
; Zero-masked <8 x i64> permute with indices 1,0,1,1,5,4,5,5 (the upper four
; indices are the lower four plus 4).
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and a vpermq writing zmm0 under
; {%k1} {z}, with no separate index-vector load.
2013define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
2014; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
2015; GENERIC:       # %bb.0:
2016; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2017; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
2018; GENERIC-NEXT:    retq # sched: [1:1.00]
2019;
2020; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
2021; SKX:       # %bb.0:
2022; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2023; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
2024; SKX-NEXT:    retq # sched: [7:1.00]
2025  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
2026  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2027  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2028  ret <8 x i64> %res
2029}
; Merge-masked <8 x i64> permute with indices 1,3,7,3,3,5,4,1.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect the index vector in zmm3, vptestnmq producing k1, a
; register-indexed vpermq writing zmm1 under {%k1}, and a vmovdqa64 copying
; zmm1 into zmm0.
2030define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
2031; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
2032; GENERIC:       # %bb.0:
2033; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
2034; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
2035; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
2036; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
2037; GENERIC-NEXT:    retq # sched: [1:1.00]
2038;
2039; SKX-LABEL: test_masked_8xi64_perm_mask2:
2040; SKX:       # %bb.0:
2041; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
2042; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
2043; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
2044; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
2045; SKX-NEXT:    retq # sched: [7:1.00]
2046  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
2047  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2048  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2049  ret <8 x i64> %res
2050}
2051
; Zero-masked <8 x i64> permute with indices 1,3,7,3,3,5,4,1.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect the index vector in zmm2, vptestnmq producing k1, and a
; register-indexed vpermq writing zmm0 under {%k1} {z}.
2052define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
2053; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
2054; GENERIC:       # %bb.0:
2055; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
2056; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2057; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
2058; GENERIC-NEXT:    retq # sched: [1:1.00]
2059;
2060; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
2061; SKX:       # %bb.0:
2062; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
2063; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2064; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
2065; SKX-NEXT:    retq # sched: [7:1.00]
2066  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
2067  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2068  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2069  ret <8 x i64> %res
2070}
; Unmasked single-source permute of a <8 x i64> argument with indices
; 3,1,3,1,7,5,7,5 (the upper four indices are the lower four plus 4).
; The checks expect a single vpermpd on zmm0 with no separate index-vector
; load (vpermpd rather than vpermq, although the IR element type is i64).
2071define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
2072; GENERIC-LABEL: test_8xi64_perm_imm_mask3:
2073; GENERIC:       # %bb.0:
2074; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
2075; GENERIC-NEXT:    retq # sched: [1:1.00]
2076;
2077; SKX-LABEL: test_8xi64_perm_imm_mask3:
2078; SKX:       # %bb.0:
2079; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
2080; SKX-NEXT:    retq # sched: [7:1.00]
2081  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
2082  ret <8 x i64> %res
2083}
; Merge-masked <8 x i64> permute with indices 3,1,3,1,7,5,7,5 (the upper four
; indices are the lower four plus 4).
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1, a vpermq writing zmm1 under {%k1}
; with no separate index-vector load, and a vmovdqa64 copying zmm1 into zmm0.
2084define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
2085; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
2086; GENERIC:       # %bb.0:
2087; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
2088; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
2089; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
2090; GENERIC-NEXT:    retq # sched: [1:1.00]
2091;
2092; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
2093; SKX:       # %bb.0:
2094; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
2095; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
2096; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
2097; SKX-NEXT:    retq # sched: [7:1.00]
2098  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
2099  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2100  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2101  ret <8 x i64> %res
2102}
2103
; Zero-masked <8 x i64> permute with indices 3,1,3,1,7,5,7,5 (the upper four
; indices are the lower four plus 4).
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and a vpermq writing zmm0 under
; {%k1} {z}, with no separate index-vector load.
2104define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
2105; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
2106; GENERIC:       # %bb.0:
2107; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2108; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
2109; GENERIC-NEXT:    retq # sched: [1:1.00]
2110;
2111; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
2112; SKX:       # %bb.0:
2113; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2114; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
2115; SKX-NEXT:    retq # sched: [7:1.00]
2116  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
2117  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2118  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2119  ret <8 x i64> %res
2120}
; Merge-masked <8 x i64> permute with indices 6,3,1,1,7,4,0,3.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect the index vector in zmm3, vptestnmq producing k1, a
; register-indexed vpermq writing zmm1 under {%k1}, and a vmovdqa64 copying
; zmm1 into zmm0.
2121define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
2122; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
2123; GENERIC:       # %bb.0:
2124; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
2125; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
2126; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
2127; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
2128; GENERIC-NEXT:    retq # sched: [1:1.00]
2129;
2130; SKX-LABEL: test_masked_8xi64_perm_mask4:
2131; SKX:       # %bb.0:
2132; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
2133; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
2134; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
2135; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
2136; SKX-NEXT:    retq # sched: [7:1.00]
2137  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
2138  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2139  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2140  ret <8 x i64> %res
2141}
2142
; Zero-masked <8 x i64> permute with indices 6,3,1,1,7,4,0,3.
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect the index vector in zmm2, vptestnmq producing k1, and a
; register-indexed vpermq writing zmm0 under {%k1} {z}.
2143define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
2144; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
2145; GENERIC:       # %bb.0:
2146; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
2147; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2148; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
2149; GENERIC-NEXT:    retq # sched: [1:1.00]
2150;
2151; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
2152; SKX:       # %bb.0:
2153; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
2154; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2155; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
2156; SKX-NEXT:    retq # sched: [7:1.00]
2157  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
2158  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2159  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2160  ret <8 x i64> %res
2161}
; Merge-masked <8 x i64> permute with indices 0,0,0,0,4,4,4,4 (the upper four
; indices are the lower four plus 4).
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is taken from %vec2.
; The checks expect vptestnmq producing k1, a vpermq writing zmm1 under {%k1}
; with no separate index-vector load, and a vmovdqa64 copying zmm1 into zmm0.
2162define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
2163; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
2164; GENERIC:       # %bb.0:
2165; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
2166; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
2167; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
2168; GENERIC-NEXT:    retq # sched: [1:1.00]
2169;
2170; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
2171; SKX:       # %bb.0:
2172; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
2173; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
2174; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
2175; SKX-NEXT:    retq # sched: [7:1.00]
2176  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
2177  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2178  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2179  ret <8 x i64> %res
2180}
2181
; Zero-masked <8 x i64> permute with indices 0,0,0,0,4,4,4,4 (the upper four
; indices are the lower four plus 4).
; A shuffled lane is kept where the matching %mask element equals zero; every
; other lane is zero.
; The checks expect vptestnmq producing k1 and a vpermq writing zmm0 under
; {%k1} {z}, with no separate index-vector load.
2182define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
2183; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
2184; GENERIC:       # %bb.0:
2185; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2186; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
2187; GENERIC-NEXT:    retq # sched: [1:1.00]
2188;
2189; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
2190; SKX:       # %bb.0:
2191; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2192; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
2193; SKX-NEXT:    retq # sched: [7:1.00]
2194  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
2195  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2196  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2197  ret <8 x i64> %res
2198}
; Unmasked <8 x i64> permute of %vec with indices <5,1,4,4,5,4,2,7>.
; The checks expect the index vector to be loaded with vmovaps and the shuffle
; to be done by vpermpd, i.e. the floating-point instruction forms are expected
; even though the IR element type is i64.
2199define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
2200; GENERIC-LABEL: test_8xi64_perm_mask6:
2201; GENERIC:       # %bb.0:
2202; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
2203; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
2204; GENERIC-NEXT:    retq # sched: [1:1.00]
2205;
2206; SKX-LABEL: test_8xi64_perm_mask6:
2207; SKX:       # %bb.0:
2208; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
2209; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
2210; SKX-NEXT:    retq # sched: [7:1.00]
2211  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
2212  ret <8 x i64> %res
2213}
; Merge-masked form of the <5,1,4,4,5,4,2,7> permute: lanes where %mask == 0
; take the shuffled %vec element, the rest keep %vec2.  Expected sequence is an
; index-vector load (vmovdqa64), vptestnmq into %k1, a write-masked vpermq into
; zmm1 and a copy of zmm1 to zmm0.
2214define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
2215; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
2216; GENERIC:       # %bb.0:
2217; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
2218; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
2219; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
2220; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
2221; GENERIC-NEXT:    retq # sched: [1:1.00]
2222;
2223; SKX-LABEL: test_masked_8xi64_perm_mask6:
2224; SKX:       # %bb.0:
2225; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
2226; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
2227; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
2228; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
2229; SKX-NEXT:    retq # sched: [7:1.00]
2230  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
2231  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2232  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2233  ret <8 x i64> %res
2234}
2235
; Zero-masked form of the <5,1,4,4,5,4,2,7> permute: lanes where %mask == 0
; take the shuffled %vec element, the rest become zero.  Expected sequence is
; an index-vector load (vmovdqa64), vptestnmq into %k1 and a {%k1} {z} vpermq
; that writes zmm0 directly.
2236define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
2237; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
2238; GENERIC:       # %bb.0:
2239; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
2240; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2241; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
2242; GENERIC-NEXT:    retq # sched: [1:1.00]
2243;
2244; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
2245; SKX:       # %bb.0:
2246; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
2247; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2248; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
2249; SKX-NEXT:    retq # sched: [7:1.00]
2250  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
2251  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2252  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2253  ret <8 x i64> %res
2254}
; Merge-masked permute with indices <3,3,3,3,7,7,7,7>: lanes where %mask == 0
; take the shuffled %vec element, the rest keep %vec2.  The checks expect
; vptestnmq, one write-masked vpermq with the indices encoded in the
; instruction (no index-vector load) and a copy of zmm1 to zmm0.
2255define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
2256; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
2257; GENERIC:       # %bb.0:
2258; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
2259; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
2260; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
2261; GENERIC-NEXT:    retq # sched: [1:1.00]
2262;
2263; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
2264; SKX:       # %bb.0:
2265; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
2266; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
2267; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
2268; SKX-NEXT:    retq # sched: [7:1.00]
2269  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
2270  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2271  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2272  ret <8 x i64> %res
2273}
2274
; Zero-masked permute with indices <3,3,3,3,7,7,7,7>: lanes where %mask == 0
; take the shuffled %vec element, the rest become zero.  The checks expect
; vptestnmq plus a single {%k1} {z} vpermq that writes zmm0 directly.
2275define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
2276; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
2277; GENERIC:       # %bb.0:
2278; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2279; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
2280; GENERIC-NEXT:    retq # sched: [1:1.00]
2281;
2282; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
2283; SKX:       # %bb.0:
2284; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2285; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
2286; SKX-NEXT:    retq # sched: [7:1.00]
2287  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
2288  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2289  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2290  ret <8 x i64> %res
2291}
; Unmasked <8 x i64> permute, indices <5,1,6,5,7,3,7,3>, of a vector loaded
; from %vp.  The checks expect the index vector to be loaded with vmovaps and
; the data load to be folded into vpermpd as a (%rdi) memory operand.
2292define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
2293; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
2294; GENERIC:       # %bb.0:
2295; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
2296; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
2297; GENERIC-NEXT:    retq # sched: [1:1.00]
2298;
2299; SKX-LABEL: test_8xi64_perm_mem_mask0:
2300; SKX:       # %bb.0:
2301; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
2302; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
2303; SKX-NEXT:    retq # sched: [7:1.00]
2304  %vec = load <8 x i64>, <8 x i64>* %vp
2305  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
2306  ret <8 x i64> %res
2307}
; Merge-masked <5,1,6,5,7,3,7,3> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  %vec2
; arrives in zmm0, so the write-masked vpermq with a folded (%rdi) operand is
; expected to target zmm0 directly with no trailing copy.
2308define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2309; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
2310; GENERIC:       # %bb.0:
2311; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
2312; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2313; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
2314; GENERIC-NEXT:    retq # sched: [1:1.00]
2315;
2316; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
2317; SKX:       # %bb.0:
2318; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
2319; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2320; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
2321; SKX-NEXT:    retq # sched: [7:1.00]
2322  %vec = load <8 x i64>, <8 x i64>* %vp
2323  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
2324  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2325  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2326  ret <8 x i64> %res
2327}
2328
; Zero-masked <5,1,6,5,7,3,7,3> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  Expected
; sequence is an index-vector load, vptestnmq into %k1 and a {%k1} {z} vpermq
; with the data load folded as (%rdi).
2329define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
2330; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
2331; GENERIC:       # %bb.0:
2332; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
2333; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2334; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
2335; GENERIC-NEXT:    retq # sched: [1:1.00]
2336;
2337; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
2338; SKX:       # %bb.0:
2339; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
2340; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2341; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
2342; SKX-NEXT:    retq # sched: [7:1.00]
2343  %vec = load <8 x i64>, <8 x i64>* %vp
2344  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
2345  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2346  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2347  ret <8 x i64> %res
2348}
2349
; Merge-masked <1,1,1,0,5,5,5,4> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  The checks
; expect vptestnmq and one write-masked vpermq reading memory with the indices
; encoded in the instruction, so no index-vector load appears.
2350define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2351; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
2352; GENERIC:       # %bb.0:
2353; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2354; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
2355; GENERIC-NEXT:    retq # sched: [1:1.00]
2356;
2357; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
2358; SKX:       # %bb.0:
2359; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2360; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
2361; SKX-NEXT:    retq # sched: [7:1.00]
2362  %vec = load <8 x i64>, <8 x i64>* %vp
2363  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
2364  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2365  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2366  ret <8 x i64> %res
2367}
2368
; Zero-masked <1,1,1,0,5,5,5,4> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  The checks
; expect vptestnmq and one {%k1} {z} vpermq reading memory, with no
; index-vector load.
2369define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
2370; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
2371; GENERIC:       # %bb.0:
2372; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2373; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
2374; GENERIC-NEXT:    retq # sched: [1:1.00]
2375;
2376; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
2377; SKX:       # %bb.0:
2378; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2379; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
2380; SKX-NEXT:    retq # sched: [7:1.00]
2381  %vec = load <8 x i64>, <8 x i64>* %vp
2382  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
2383  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2384  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2385  ret <8 x i64> %res
2386}
2387
; Merge-masked <0,2,1,4,1,1,5,5> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  Expected
; sequence is an index-vector load (vmovdqa64), vptestnmq into %k1 and a
; write-masked vpermq with the data load folded as (%rdi).
2388define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2389; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
2390; GENERIC:       # %bb.0:
2391; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
2392; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2393; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
2394; GENERIC-NEXT:    retq # sched: [1:1.00]
2395;
2396; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
2397; SKX:       # %bb.0:
2398; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
2399; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2400; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
2401; SKX-NEXT:    retq # sched: [7:1.00]
2402  %vec = load <8 x i64>, <8 x i64>* %vp
2403  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
2404  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2405  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2406  ret <8 x i64> %res
2407}
2408
; Zero-masked <0,2,1,4,1,1,5,5> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  Expected
; sequence is an index-vector load, vptestnmq into %k1 and a {%k1} {z} vpermq
; with the data load folded as (%rdi).
2409define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
2410; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
2411; GENERIC:       # %bb.0:
2412; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
2413; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2414; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
2415; GENERIC-NEXT:    retq # sched: [1:1.00]
2416;
2417; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
2418; SKX:       # %bb.0:
2419; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
2420; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2421; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
2422; SKX-NEXT:    retq # sched: [7:1.00]
2423  %vec = load <8 x i64>, <8 x i64>* %vp
2424  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
2425  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2426  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2427  ret <8 x i64> %res
2428}
2429
; Unmasked <1,3,1,1,5,7,5,5> permute of an <8 x i64> vector loaded from %vp.
; The checks expect a single vpermpd that reads memory and carries the indices
; in the instruction; no separate load or index vector is expected.
2430define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
2431; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3:
2432; GENERIC:       # %bb.0:
2433; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
2434; GENERIC-NEXT:    retq # sched: [1:1.00]
2435;
2436; SKX-LABEL: test_8xi64_perm_imm_mem_mask3:
2437; SKX:       # %bb.0:
2438; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
2439; SKX-NEXT:    retq # sched: [7:1.00]
2440  %vec = load <8 x i64>, <8 x i64>* %vp
2441  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
2442  ret <8 x i64> %res
2443}
; Merge-masked <1,3,1,1,5,7,5,5> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  The checks
; expect vptestnmq and one write-masked vpermq reading memory, with no
; index-vector load.
2444define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2445; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
2446; GENERIC:       # %bb.0:
2447; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2448; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
2449; GENERIC-NEXT:    retq # sched: [1:1.00]
2450;
2451; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
2452; SKX:       # %bb.0:
2453; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2454; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
2455; SKX-NEXT:    retq # sched: [7:1.00]
2456  %vec = load <8 x i64>, <8 x i64>* %vp
2457  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
2458  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2459  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2460  ret <8 x i64> %res
2461}
2462
; Zero-masked <1,3,1,1,5,7,5,5> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  The checks
; expect vptestnmq and one {%k1} {z} vpermq reading memory, with no
; index-vector load.
2463define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
2464; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
2465; GENERIC:       # %bb.0:
2466; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2467; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
2468; GENERIC-NEXT:    retq # sched: [1:1.00]
2469;
2470; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
2471; SKX:       # %bb.0:
2472; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2473; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
2474; SKX-NEXT:    retq # sched: [7:1.00]
2475  %vec = load <8 x i64>, <8 x i64>* %vp
2476  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
2477  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2478  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2479  ret <8 x i64> %res
2480}
2481
; Merge-masked <5,0,7,0,3,5,0,6> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  Expected
; sequence is an index-vector load (vmovdqa64), vptestnmq into %k1 and a
; write-masked vpermq with the data load folded as (%rdi).
2482define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2483; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
2484; GENERIC:       # %bb.0:
2485; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
2486; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2487; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
2488; GENERIC-NEXT:    retq # sched: [1:1.00]
2489;
2490; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
2491; SKX:       # %bb.0:
2492; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
2493; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2494; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
2495; SKX-NEXT:    retq # sched: [7:1.00]
2496  %vec = load <8 x i64>, <8 x i64>* %vp
2497  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
2498  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2499  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2500  ret <8 x i64> %res
2501}
2502
; Zero-masked <5,0,7,0,3,5,0,6> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  Expected
; sequence is an index-vector load, vptestnmq into %k1 and a {%k1} {z} vpermq
; with the data load folded as (%rdi).
2503define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
2504; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
2505; GENERIC:       # %bb.0:
2506; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
2507; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2508; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
2509; GENERIC-NEXT:    retq # sched: [1:1.00]
2510;
2511; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
2512; SKX:       # %bb.0:
2513; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
2514; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2515; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
2516; SKX-NEXT:    retq # sched: [7:1.00]
2517  %vec = load <8 x i64>, <8 x i64>* %vp
2518  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
2519  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2520  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2521  ret <8 x i64> %res
2522}
2523
; Merge-masked <3,1,0,0,7,5,4,4> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  The checks
; expect vptestnmq and one write-masked vpermq reading memory, with no
; index-vector load.
2524define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2525; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
2526; GENERIC:       # %bb.0:
2527; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2528; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
2529; GENERIC-NEXT:    retq # sched: [1:1.00]
2530;
2531; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
2532; SKX:       # %bb.0:
2533; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2534; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
2535; SKX-NEXT:    retq # sched: [7:1.00]
2536  %vec = load <8 x i64>, <8 x i64>* %vp
2537  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
2538  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2539  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2540  ret <8 x i64> %res
2541}
2542
; Zero-masked <3,1,0,0,7,5,4,4> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  The checks
; expect vptestnmq and one {%k1} {z} vpermq reading memory, with no
; index-vector load.
2543define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
2544; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
2545; GENERIC:       # %bb.0:
2546; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2547; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
2548; GENERIC-NEXT:    retq # sched: [1:1.00]
2549;
2550; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
2551; SKX:       # %bb.0:
2552; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2553; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
2554; SKX-NEXT:    retq # sched: [7:1.00]
2555  %vec = load <8 x i64>, <8 x i64>* %vp
2556  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
2557  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2558  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2559  ret <8 x i64> %res
2560}
2561
; Unmasked <8 x i64> permute, indices <0,6,3,7,3,0,3,6>, of a vector loaded
; from %vp.  The checks expect the index vector to be loaded with vmovaps and
; the data load to be folded into vpermpd as a (%rdi) memory operand.
2562define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
2563; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
2564; GENERIC:       # %bb.0:
2565; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
2566; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
2567; GENERIC-NEXT:    retq # sched: [1:1.00]
2568;
2569; SKX-LABEL: test_8xi64_perm_mem_mask6:
2570; SKX:       # %bb.0:
2571; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
2572; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
2573; SKX-NEXT:    retq # sched: [7:1.00]
2574  %vec = load <8 x i64>, <8 x i64>* %vp
2575  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
2576  ret <8 x i64> %res
2577}
; Merge-masked <0,6,3,7,3,0,3,6> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  Expected
; sequence is an index-vector load (vmovdqa64), vptestnmq into %k1 and a
; write-masked vpermq with the data load folded as (%rdi).
2578define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2579; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
2580; GENERIC:       # %bb.0:
2581; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
2582; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2583; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
2584; GENERIC-NEXT:    retq # sched: [1:1.00]
2585;
2586; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
2587; SKX:       # %bb.0:
2588; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
2589; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2590; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
2591; SKX-NEXT:    retq # sched: [7:1.00]
2592  %vec = load <8 x i64>, <8 x i64>* %vp
2593  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
2594  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2595  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2596  ret <8 x i64> %res
2597}
2598
; Zero-masked <0,6,3,7,3,0,3,6> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  Expected
; sequence is an index-vector load, vptestnmq into %k1 and a {%k1} {z} vpermq
; with the data load folded as (%rdi).
2599define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
2600; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
2601; GENERIC:       # %bb.0:
2602; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
2603; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2604; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
2605; GENERIC-NEXT:    retq # sched: [1:1.00]
2606;
2607; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
2608; SKX:       # %bb.0:
2609; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
2610; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2611; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
2612; SKX-NEXT:    retq # sched: [7:1.00]
2613  %vec = load <8 x i64>, <8 x i64>* %vp
2614  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
2615  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2616  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2617  ret <8 x i64> %res
2618}
2619
; Merge-masked <3,0,0,1,7,4,4,5> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest keep %vec2.  The checks
; expect vptestnmq and one write-masked vpermq reading memory, with no
; index-vector load.
2620define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
2621; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
2622; GENERIC:       # %bb.0:
2623; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
2624; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
2625; GENERIC-NEXT:    retq # sched: [1:1.00]
2626;
2627; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
2628; SKX:       # %bb.0:
2629; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
2630; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
2631; SKX-NEXT:    retq # sched: [7:1.00]
2632  %vec = load <8 x i64>, <8 x i64>* %vp
2633  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
2634  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2635  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
2636  ret <8 x i64> %res
2637}
2638
; Zero-masked <3,0,0,1,7,4,4,5> permute of a vector loaded from %vp: lanes
; where %mask == 0 take the shuffled element, the rest become zero.  The checks
; expect vptestnmq and one {%k1} {z} vpermq reading memory, with no
; index-vector load.
2639define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
2640; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
2641; GENERIC:       # %bb.0:
2642; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
2643; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
2644; GENERIC-NEXT:    retq # sched: [1:1.00]
2645;
2646; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
2647; SKX:       # %bb.0:
2648; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
2649; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
2650; SKX-NEXT:    retq # sched: [7:1.00]
2651  %vec = load <8 x i64>, <8 x i64>* %vp
2652  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
2653  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
2654  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
2655  ret <8 x i64> %res
2656}
2657
; Unmasked <8 x float> permute of %vec with indices <3,4,2,4,1,2,3,4>.
; The checks expect a ymm index-vector load (vmovaps) followed by vpermps.
2658define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
2659; GENERIC-LABEL: test_8xfloat_perm_mask0:
2660; GENERIC:       # %bb.0:
2661; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
2662; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
2663; GENERIC-NEXT:    retq # sched: [1:1.00]
2664;
2665; SKX-LABEL: test_8xfloat_perm_mask0:
2666; SKX:       # %bb.0:
2667; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
2668; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
2669; SKX-NEXT:    retq # sched: [7:1.00]
2670  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
2671  ret <8 x float> %res
2672}
; Merge-masked <3,4,2,4,1,2,3,4> permute on <8 x float>: lanes where the
; <8 x i32> %mask is zero take the shuffled %vec element, the rest keep %vec2.
; Expected sequence is an index-vector load, vptestnmd on ymm into %k1, a
; write-masked vpermps into ymm1 and a copy of ymm1 to ymm0.
2673define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
2674; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
2675; GENERIC:       # %bb.0:
2676; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
2677; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
2678; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
2679; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
2680; GENERIC-NEXT:    retq # sched: [1:1.00]
2681;
2682; SKX-LABEL: test_masked_8xfloat_perm_mask0:
2683; SKX:       # %bb.0:
2684; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
2685; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
2686; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
2687; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
2688; SKX-NEXT:    retq # sched: [7:1.00]
2689  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
2690  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2691  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2692  ret <8 x float> %res
2693}
2694
; Zero-masked <3,4,2,4,1,2,3,4> permute on <8 x float>: lanes where the
; <8 x i32> %mask is zero take the shuffled %vec element, the rest become zero.
; Expected sequence is an index-vector load, vptestnmd on ymm into %k1 and a
; {%k1} {z} vpermps that writes ymm0 directly.
2695define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> %mask) {
2696; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
2697; GENERIC:       # %bb.0:
2698; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
2699; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
2700; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
2701; GENERIC-NEXT:    retq # sched: [1:1.00]
2702;
2703; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
2704; SKX:       # %bb.0:
2705; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
2706; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
2707; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
2708; SKX-NEXT:    retq # sched: [7:1.00]
2709  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
2710  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2711  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2712  ret <8 x float> %res
2713}
; Merge-masked <4,2,1,0,6,0,5,1> permute on <8 x float>: lanes where the
; <8 x i32> %mask is zero take the shuffled %vec element, the rest keep %vec2.
; Expected sequence is an index-vector load, vptestnmd on ymm into %k1, a
; write-masked vpermps into ymm1 and a copy of ymm1 to ymm0.
2714define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
2715; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
2716; GENERIC:       # %bb.0:
2717; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
2718; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
2719; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
2720; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
2721; GENERIC-NEXT:    retq # sched: [1:1.00]
2722;
2723; SKX-LABEL: test_masked_8xfloat_perm_mask1:
2724; SKX:       # %bb.0:
2725; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
2726; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
2727; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
2728; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
2729; SKX-NEXT:    retq # sched: [7:1.00]
2730  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
2731  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2732  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2733  ret <8 x float> %res
2734}
2735
; Zero-masked <4,2,1,0,6,0,5,1> permute on <8 x float>: lanes where %mask is
; zero take the shuffled %vec element, the rest become zero.
; The mask is <8 x i32>, the same element type the other 8xfloat masked tests
; in this file use, so the mask compare is the 256-bit vptestnmd on ymm rather
; than a 512-bit vptestnmq on zmm.  Expected sequence is an index-vector load,
; vptestnmd into %k1 and a {%k1} {z} vpermps that writes ymm0 directly.
2736define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i32> %mask) {
2737; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
2738; GENERIC:       # %bb.0:
2739; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
2740; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
2741; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
2742; GENERIC-NEXT:    retq # sched: [1:1.00]
2743;
2744; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
2745; SKX:       # %bb.0:
2746; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
2747; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
2748; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
2749; SKX-NEXT:    retq # sched: [7:1.00]
2750  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
2751  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2752  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2753  ret <8 x float> %res
2754}
; Merge-masked <2,5,5,5,4,6,0,5> permute on <8 x float>: lanes where the
; <8 x i32> %mask is zero take the shuffled %vec element, the rest keep %vec2.
; Expected sequence is an index-vector load, vptestnmd on ymm into %k1, a
; write-masked vpermps into ymm1 and a copy of ymm1 to ymm0.
2755define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
2756; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
2757; GENERIC:       # %bb.0:
2758; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
2759; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
2760; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
2761; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
2762; GENERIC-NEXT:    retq # sched: [1:1.00]
2763;
2764; SKX-LABEL: test_masked_8xfloat_perm_mask2:
2765; SKX:       # %bb.0:
2766; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
2767; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
2768; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
2769; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
2770; SKX-NEXT:    retq # sched: [7:1.00]
2771  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
2772  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2773  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
2774  ret <8 x float> %res
2775}
2776
; Zero-masked <2,5,5,5,4,6,0,5> permute on <8 x float>: lanes where the
; <8 x i32> %mask is zero take the shuffled %vec element, the rest become zero.
; Expected sequence is an index-vector load, vptestnmd on ymm into %k1 and a
; {%k1} {z} vpermps that writes ymm0 directly.
2777define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> %mask) {
2778; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
2779; GENERIC:       # %bb.0:
2780; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
2781; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
2782; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
2783; GENERIC-NEXT:    retq # sched: [1:1.00]
2784;
2785; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
2786; SKX:       # %bb.0:
2787; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
2788; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
2789; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
2790; SKX-NEXT:    retq # sched: [7:1.00]
2791  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
2792  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
2793  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
2794  ret <8 x float> %res
2795}
; Unmasked single-source permute of an <8 x float> register operand.
define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
; GENERIC-LABEL: test_8xfloat_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
  ret <8 x float> %permuted
}
; Merge-masked permute of an <8 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane keeps %vec2.
define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %blended = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> %vec2
  ret <8 x float> %blended
}
2832
; Zero-masked permute of an <8 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane becomes zero.
define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %zeroed = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> zeroinitializer
  ret <8 x float> %zeroed
}
; Unmasked single-source permute of an <8 x float> vector loaded from %vp; the
; assertions expect the load to appear as the memory operand of vpermps.
define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
; GENERIC-LABEL: test_8xfloat_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
  ret <8 x float> %permuted
}
; Merge-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest keep %vec2.
define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %blended = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> %vec2
  ret <8 x float> %blended
}
2888
; Zero-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest become zero.
define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %zeroed = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> zeroinitializer
  ret <8 x float> %zeroed
}
2909
; Merge-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest keep %vec2.
define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %blended = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> %vec2
  ret <8 x float> %blended
}
2930
; Zero-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest become zero.
define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %zeroed = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> zeroinitializer
  ret <8 x float> %zeroed
}
2951
; Merge-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest keep %vec2.
define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %blended = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> %vec2
  ret <8 x float> %blended
}
2972
; Zero-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest become zero.
define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %zeroed = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> zeroinitializer
  ret <8 x float> %zeroed
}
2993
; Unmasked single-source permute of an <8 x float> vector loaded from %vp; the
; assertions expect the load to appear as the memory operand of vpermps.
; The function takes only the pointer: this variant applies no write mask, so
; it has the same parameter list as the other unmasked memory-operand tests.
define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
; GENERIC-LABEL: test_8xfloat_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <8 x float>, <8 x float>* %vp
  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
  ret <8 x float> %res
}
; Merge-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest keep %vec2.
define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %blended = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> %vec2
  ret <8 x float> %blended
}
3030
; Zero-masked permute of an <8 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest become zero.
define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <8 x float>, <8 x float>* %vp
  %permuted = shufflevector <8 x float> %loaded, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
  %lane_is_zero = icmp eq <8 x i32> %mask, zeroinitializer
  %zeroed = select <8 x i1> %lane_is_zero, <8 x float> %permuted, <8 x float> zeroinitializer
  ret <8 x float> %zeroed
}
3051
; Unmasked single-source permute of a <16 x float> register operand.
define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
; GENERIC-LABEL: test_16xfloat_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
  ret <16 x float> %permuted
}
; Merge-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane keeps %vec2.
define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %blended = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> %vec2
  ret <16 x float> %blended
}
3088
; Zero-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane becomes zero.
define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %zeroed = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> zeroinitializer
  ret <16 x float> %zeroed
}
; Merge-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane keeps %vec2.
define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %blended = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> %vec2
  ret <16 x float> %blended
}
3129
; Zero-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane becomes zero.
define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %zeroed = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> zeroinitializer
  ret <16 x float> %zeroed
}
; Merge-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane keeps %vec2.
define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %blended = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> %vec2
  ret <16 x float> %blended
}
3170
; Zero-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane becomes zero.
define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %zeroed = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> zeroinitializer
  ret <16 x float> %zeroed
}
; Unmasked single-source permute of a <16 x float> register operand.
define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
; GENERIC-LABEL: test_16xfloat_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
  ret <16 x float> %permuted
}
; Merge-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane keeps %vec2.
define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %blended = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> %vec2
  ret <16 x float> %blended
}
3226
; Zero-masked permute of a <16 x float> register operand: lanes whose %mask
; element equals zero take the shuffled value, every other lane becomes zero.
define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %permuted = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %zeroed = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> zeroinitializer
  ret <16 x float> %zeroed
}
; Unmasked single-source permute of a <16 x float> vector loaded from %vp; the
; assertions expect the load to appear as the memory operand of vpermps.
define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xfloat_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <16 x float>, <16 x float>* %vp
  %permuted = shufflevector <16 x float> %loaded, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
  ret <16 x float> %permuted
}
; Merge-masked permute of a <16 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest keep %vec2.
define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <16 x float>, <16 x float>* %vp
  %permuted = shufflevector <16 x float> %loaded, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %blended = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> %vec2
  ret <16 x float> %blended
}
3282
; Zero-masked permute of a <16 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest become zero.
define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <16 x float>, <16 x float>* %vp
  %permuted = shufflevector <16 x float> %loaded, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %zeroed = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> zeroinitializer
  ret <16 x float> %zeroed
}
3303
; Merge-masked permute of a <16 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest keep %vec2.
define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <16 x float>, <16 x float>* %vp
  %permuted = shufflevector <16 x float> %loaded, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %blended = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> %vec2
  ret <16 x float> %blended
}
3324
; Zero-masked permute of a <16 x float> vector loaded from %vp: lanes whose
; %mask element equals zero take the shuffled value, the rest become zero.
define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) {
; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %loaded = load <16 x float>, <16 x float>* %vp
  %permuted = shufflevector <16 x float> %loaded, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
  %lane_is_zero = icmp eq <16 x i32> %mask, zeroinitializer
  %zeroed = select <16 x i1> %lane_is_zero, <16 x float> %permuted, <16 x float> zeroinitializer
  ret <16 x float> %zeroed
}
3345
; Merge-masked permute of a <16 x float> loaded from %vp, using the index
; pattern [10,7,11,6,...]. Lanes whose %mask element is zero take the permuted
; value; every other lane keeps %vec2. Both check prefixes expect the index
; vector in zmm2, the write mask in k1, and vpermps reading from (%rdi).
3346define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
3347; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
3348; GENERIC:       # %bb.0:
3349; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
3350; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
3351; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
3352; GENERIC-NEXT:    retq # sched: [1:1.00]
3353;
3354; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
3355; SKX:       # %bb.0:
3356; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
3357; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
3358; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
3359; SKX-NEXT:    retq # sched: [7:1.00]
3360  %vec = load <16 x float>, <16 x float>* %vp
3361  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
  ; A lane is enabled when its %mask element compares equal to zero.
3362  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
3363  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
3364  ret <16 x float> %res
3365}
3366
; Zero-masked permute of a <16 x float> loaded from %vp, using the index
; pattern [10,7,11,6,...]. Lanes whose %mask element is zero take the permuted
; value; every other lane becomes 0.0. Both check prefixes expect the index
; vector in zmm1, the write mask in k1, and a {%k1} {z} vpermps from (%rdi).
3367define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) {
3368; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
3369; GENERIC:       # %bb.0:
3370; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
3371; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
3372; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
3373; GENERIC-NEXT:    retq # sched: [1:1.00]
3374;
3375; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
3376; SKX:       # %bb.0:
3377; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
3378; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
3379; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
3380; SKX-NEXT:    retq # sched: [7:1.00]
3381  %vec = load <16 x float>, <16 x float>* %vp
3382  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
  ; A lane is enabled when its %mask element compares equal to zero.
3383  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
3384  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
3385  ret <16 x float> %res
3386}
3387
; Unmasked permute of a <16 x float> loaded from %vp with a constant index
; pattern. Both check prefixes expect the index vector in zmm0 followed by a
; single vpermps whose data operand is read directly from (%rdi).
3388define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
3389; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
3390; GENERIC:       # %bb.0:
3391; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
3392; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
3393; GENERIC-NEXT:    retq # sched: [1:1.00]
3394;
3395; SKX-LABEL: test_16xfloat_perm_mem_mask3:
3396; SKX:       # %bb.0:
3397; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
3398; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
3399; SKX-NEXT:    retq # sched: [7:1.00]
3400  %vec = load <16 x float>, <16 x float>* %vp
  ; Single-source shuffle: the second operand is undef and all indices are < 16.
3401  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
3402  ret <16 x float> %res
3403}
; Merge-masked permute of a <16 x float> loaded from %vp, using the index
; pattern [15,15,3,9,...]. Lanes whose %mask element is zero take the permuted
; value; every other lane keeps %vec2. Both check prefixes expect the index
; vector in zmm2, the write mask in k1, and vpermps reading from (%rdi).
3404define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
3405; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
3406; GENERIC:       # %bb.0:
3407; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
3408; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
3409; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
3410; GENERIC-NEXT:    retq # sched: [1:1.00]
3411;
3412; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
3413; SKX:       # %bb.0:
3414; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
3415; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
3416; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
3417; SKX-NEXT:    retq # sched: [7:1.00]
3418  %vec = load <16 x float>, <16 x float>* %vp
3419  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
  ; A lane is enabled when its %mask element compares equal to zero.
3420  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
3421  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
3422  ret <16 x float> %res
3423}
3424
; Zero-masked permute of a <16 x float> loaded from %vp, using the index
; pattern [15,15,3,9,...]. Lanes whose %mask element is zero take the permuted
; value; every other lane becomes 0.0. Both check prefixes expect the index
; vector in zmm1, the write mask in k1, and a {%k1} {z} vpermps from (%rdi).
3425define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) {
3426; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
3427; GENERIC:       # %bb.0:
3428; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
3429; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
3430; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
3431; GENERIC-NEXT:    retq # sched: [1:1.00]
3432;
3433; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
3434; SKX:       # %bb.0:
3435; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
3436; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
3437; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
3438; SKX-NEXT:    retq # sched: [7:1.00]
3439  %vec = load <16 x float>, <16 x float>* %vp
3440  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
  ; A lane is enabled when its %mask element compares equal to zero.
3441  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
3442  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
3443  ret <16 x float> %res
3444}
3445
; Unmasked single-source permute of a <4 x double> register argument with the
; constant pattern [2,1,3,2]. Both check prefixes expect one vpermpd on ymm0
; with no separate index-vector load.
3446define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
3447; GENERIC-LABEL: test_4xdouble_perm_mask0:
3448; GENERIC:       # %bb.0:
3449; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [1:1.00]
3450; GENERIC-NEXT:    retq # sched: [1:1.00]
3451;
3452; SKX-LABEL: test_4xdouble_perm_mask0:
3453; SKX:       # %bb.0:
3454; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
3455; SKX-NEXT:    retq # sched: [7:1.00]
  ; Second shuffle operand is undef, so only %vec contributes elements.
3456  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
3457  ret <4 x double> %res
3458}
; Merge-masked permute of a <4 x double> with the constant pattern [2,1,3,2].
; Lanes whose %mask element is zero take the permuted value; every other lane
; keeps %vec2. Both check prefixes expect the write mask built in k1 by
; vptestnmq, the vpermpd writing ymm1 under {%k1}, and a copy of ymm1 to ymm0.
3459define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
3460; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
3461; GENERIC:       # %bb.0:
3462; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
3463; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
3464; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
3465; GENERIC-NEXT:    retq # sched: [1:1.00]
3466;
3467; SKX-LABEL: test_masked_4xdouble_perm_mask0:
3468; SKX:       # %bb.0:
3469; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
3470; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
3471; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
3472; SKX-NEXT:    retq # sched: [7:1.00]
3473  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3474  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3475  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3476  ret <4 x double> %res
3477}
3478
; Zero-masked permute of a <4 x double> with the constant pattern [2,1,3,2].
; Lanes whose %mask element is zero take the permuted value; every other lane
; becomes 0.0. Both check prefixes expect the write mask built in k1 by
; vptestnmq and a {%k1} {z} vpermpd writing ymm0 in place.
3479define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
3480; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
3481; GENERIC:       # %bb.0:
3482; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3483; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
3484; GENERIC-NEXT:    retq # sched: [1:1.00]
3485;
3486; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
3487; SKX:       # %bb.0:
3488; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3489; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
3490; SKX-NEXT:    retq # sched: [7:1.00]
3491  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3492  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3493  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3494  ret <4 x double> %res
3495}
; Merge-masked permute of a <4 x double> with the constant pattern [3,0,0,0].
; Lanes whose %mask element is zero take the permuted value; every other lane
; keeps %vec2. Both check prefixes expect the write mask built in k1 by
; vptestnmq, the vpermpd writing ymm1 under {%k1}, and a copy of ymm1 to ymm0.
3496define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
3497; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
3498; GENERIC:       # %bb.0:
3499; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
3500; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
3501; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
3502; GENERIC-NEXT:    retq # sched: [1:1.00]
3503;
3504; SKX-LABEL: test_masked_4xdouble_perm_mask1:
3505; SKX:       # %bb.0:
3506; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
3507; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
3508; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
3509; SKX-NEXT:    retq # sched: [7:1.00]
3510  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ; A lane is enabled when its %mask element compares equal to zero.
3511  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3512  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3513  ret <4 x double> %res
3514}
3515
; Zero-masked permute of a <4 x double> with the constant pattern [3,0,0,0].
; Lanes whose %mask element is zero take the permuted value; every other lane
; becomes 0.0. Both check prefixes expect the write mask built in k1 by
; vptestnmq and a {%k1} {z} vpermpd writing ymm0 in place.
3516define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
3517; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
3518; GENERIC:       # %bb.0:
3519; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3520; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
3521; GENERIC-NEXT:    retq # sched: [1:1.00]
3522;
3523; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
3524; SKX:       # %bb.0:
3525; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3526; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
3527; SKX-NEXT:    retq # sched: [7:1.00]
3528  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ; A lane is enabled when its %mask element compares equal to zero.
3529  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3530  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3531  ret <4 x double> %res
3532}
; Merge-masked permute of a <4 x double> with the constant pattern [0,3,3,1].
; Lanes whose %mask element is zero take the permuted value; every other lane
; keeps %vec2. Both check prefixes expect the write mask built in k1 by
; vptestnmq, the vpermpd writing ymm1 under {%k1}, and a copy of ymm1 to ymm0.
3533define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
3534; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
3535; GENERIC:       # %bb.0:
3536; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
3537; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
3538; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
3539; GENERIC-NEXT:    retq # sched: [1:1.00]
3540;
3541; SKX-LABEL: test_masked_4xdouble_perm_mask2:
3542; SKX:       # %bb.0:
3543; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
3544; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
3545; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
3546; SKX-NEXT:    retq # sched: [7:1.00]
3547  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
  ; A lane is enabled when its %mask element compares equal to zero.
3548  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3549  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3550  ret <4 x double> %res
3551}
3552
; Zero-masked permute of a <4 x double> with the constant pattern [0,3,3,1].
; Lanes whose %mask element is zero take the permuted value; every other lane
; becomes 0.0. Both check prefixes expect the write mask built in k1 by
; vptestnmq and a {%k1} {z} vpermpd writing ymm0 in place.
3553define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
3554; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
3555; GENERIC:       # %bb.0:
3556; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3557; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
3558; GENERIC-NEXT:    retq # sched: [1:1.00]
3559;
3560; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
3561; SKX:       # %bb.0:
3562; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3563; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
3564; SKX-NEXT:    retq # sched: [7:1.00]
3565  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
  ; A lane is enabled when its %mask element compares equal to zero.
3566  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3567  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3568  ret <4 x double> %res
3569}
; Unmasked single-source permute of a <4 x double> register argument with the
; constant pattern [3,3,3,2]. Both check prefixes expect one vpermpd on ymm0
; with no separate index-vector load.
3570define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
3571; GENERIC-LABEL: test_4xdouble_perm_mask3:
3572; GENERIC:       # %bb.0:
3573; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [1:1.00]
3574; GENERIC-NEXT:    retq # sched: [1:1.00]
3575;
3576; SKX-LABEL: test_4xdouble_perm_mask3:
3577; SKX:       # %bb.0:
3578; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
3579; SKX-NEXT:    retq # sched: [7:1.00]
  ; Second shuffle operand is undef, so only %vec contributes elements.
3580  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
3581  ret <4 x double> %res
3582}
; Merge-masked permute of a <4 x double> with the constant pattern [3,3,3,2].
; Lanes whose %mask element is zero take the permuted value; every other lane
; keeps %vec2. Both check prefixes expect the write mask built in k1 by
; vptestnmq, the vpermpd writing ymm1 under {%k1}, and a copy of ymm1 to ymm0.
3583define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
3584; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
3585; GENERIC:       # %bb.0:
3586; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
3587; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
3588; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
3589; GENERIC-NEXT:    retq # sched: [1:1.00]
3590;
3591; SKX-LABEL: test_masked_4xdouble_perm_mask3:
3592; SKX:       # %bb.0:
3593; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
3594; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
3595; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
3596; SKX-NEXT:    retq # sched: [7:1.00]
3597  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3598  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3599  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3600  ret <4 x double> %res
3601}
3602
; Zero-masked permute of a <4 x double> with the constant pattern [3,3,3,2].
; Lanes whose %mask element is zero take the permuted value; every other lane
; becomes 0.0. Both check prefixes expect the write mask built in k1 by
; vptestnmq and a {%k1} {z} vpermpd writing ymm0 in place.
3603define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
3604; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
3605; GENERIC:       # %bb.0:
3606; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3607; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
3608; GENERIC-NEXT:    retq # sched: [1:1.00]
3609;
3610; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
3611; SKX:       # %bb.0:
3612; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3613; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
3614; SKX-NEXT:    retq # sched: [7:1.00]
3615  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3616  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3617  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3618  ret <4 x double> %res
3619}
; Unmasked permute of a <4 x double> loaded from %vp with the constant pattern
; [0,0,2,0]. Both check prefixes expect a single vpermpd whose source is the
; memory operand (shown as mem[...]), with no separate load instruction.
3620define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
3621; GENERIC-LABEL: test_4xdouble_perm_mem_mask0:
3622; GENERIC:       # %bb.0:
3623; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [8:1.00]
3624; GENERIC-NEXT:    retq # sched: [1:1.00]
3625;
3626; SKX-LABEL: test_4xdouble_perm_mem_mask0:
3627; SKX:       # %bb.0:
3628; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00]
3629; SKX-NEXT:    retq # sched: [7:1.00]
3630  %vec = load <4 x double>, <4 x double>* %vp
  ; Second shuffle operand is undef, so only the loaded vector contributes.
3631  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
3632  ret <4 x double> %res
3633}
; Merge-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [0,0,2,0]. Lanes whose %mask element is zero take the permuted value;
; every other lane keeps %vec2. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} vpermpd reading from memory into ymm0.
3634define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
3635; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
3636; GENERIC:       # %bb.0:
3637; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3638; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [8:1.00]
3639; GENERIC-NEXT:    retq # sched: [1:1.00]
3640;
3641; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
3642; SKX:       # %bb.0:
3643; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3644; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
3645; SKX-NEXT:    retq # sched: [7:1.00]
3646  %vec = load <4 x double>, <4 x double>* %vp
3647  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  ; A lane is enabled when its %mask element compares equal to zero.
3648  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3649  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3650  ret <4 x double> %res
3651}
3652
; Zero-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [0,0,2,0]. Lanes whose %mask element is zero take the permuted value;
; every other lane becomes 0.0. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} {z} vpermpd reading from memory into ymm0.
3653define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
3654; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
3655; GENERIC:       # %bb.0:
3656; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
3657; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [8:1.00]
3658; GENERIC-NEXT:    retq # sched: [1:1.00]
3659;
3660; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
3661; SKX:       # %bb.0:
3662; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
3663; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
3664; SKX-NEXT:    retq # sched: [7:1.00]
3665  %vec = load <4 x double>, <4 x double>* %vp
3666  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  ; A lane is enabled when its %mask element compares equal to zero.
3667  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3668  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3669  ret <4 x double> %res
3670}
3671
; Merge-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [0,2,3,2]. Lanes whose %mask element is zero take the permuted value;
; every other lane keeps %vec2. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} vpermpd reading from memory into ymm0.
3672define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
3673; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
3674; GENERIC:       # %bb.0:
3675; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3676; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [8:1.00]
3677; GENERIC-NEXT:    retq # sched: [1:1.00]
3678;
3679; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
3680; SKX:       # %bb.0:
3681; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3682; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
3683; SKX-NEXT:    retq # sched: [7:1.00]
3684  %vec = load <4 x double>, <4 x double>* %vp
3685  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3686  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3687  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3688  ret <4 x double> %res
3689}
3690
; Zero-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [0,2,3,2]. Lanes whose %mask element is zero take the permuted value;
; every other lane becomes 0.0. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} {z} vpermpd reading from memory into ymm0.
3691define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
3692; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
3693; GENERIC:       # %bb.0:
3694; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
3695; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [8:1.00]
3696; GENERIC-NEXT:    retq # sched: [1:1.00]
3697;
3698; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
3699; SKX:       # %bb.0:
3700; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
3701; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
3702; SKX-NEXT:    retq # sched: [7:1.00]
3703  %vec = load <4 x double>, <4 x double>* %vp
3704  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3705  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3706  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3707  ret <4 x double> %res
3708}
3709
; Merge-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [3,1,1,1]. Lanes whose %mask element is zero take the permuted value;
; every other lane keeps %vec2. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} vpermpd reading from memory into ymm0.
3710define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
3711; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
3712; GENERIC:       # %bb.0:
3713; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3714; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [8:1.00]
3715; GENERIC-NEXT:    retq # sched: [1:1.00]
3716;
3717; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
3718; SKX:       # %bb.0:
3719; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3720; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
3721; SKX-NEXT:    retq # sched: [7:1.00]
3722  %vec = load <4 x double>, <4 x double>* %vp
3723  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
  ; A lane is enabled when its %mask element compares equal to zero.
3724  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3725  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3726  ret <4 x double> %res
3727}
3728
; Zero-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [3,1,1,1]. Lanes whose %mask element is zero take the permuted value;
; every other lane becomes 0.0. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} {z} vpermpd reading from memory into ymm0.
3729define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
3730; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
3731; GENERIC:       # %bb.0:
3732; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
3733; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [8:1.00]
3734; GENERIC-NEXT:    retq # sched: [1:1.00]
3735;
3736; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
3737; SKX:       # %bb.0:
3738; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
3739; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
3740; SKX-NEXT:    retq # sched: [7:1.00]
3741  %vec = load <4 x double>, <4 x double>* %vp
3742  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
  ; A lane is enabled when its %mask element compares equal to zero.
3743  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3744  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3745  ret <4 x double> %res
3746}
3747
; Unmasked permute of a <4 x double> loaded from %vp with the constant pattern
; [3,2,3,2]. Both check prefixes expect a single vpermpd whose source is the
; memory operand (shown as mem[...]), with no separate load instruction.
3748define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
3749; GENERIC-LABEL: test_4xdouble_perm_mem_mask3:
3750; GENERIC:       # %bb.0:
3751; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [8:1.00]
3752; GENERIC-NEXT:    retq # sched: [1:1.00]
3753;
3754; SKX-LABEL: test_4xdouble_perm_mem_mask3:
3755; SKX:       # %bb.0:
3756; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00]
3757; SKX-NEXT:    retq # sched: [7:1.00]
3758  %vec = load <4 x double>, <4 x double>* %vp
  ; Second shuffle operand is undef, so only the loaded vector contributes.
3759  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
3760  ret <4 x double> %res
3761}
; Merge-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [3,2,3,2]. Lanes whose %mask element is zero take the permuted value;
; every other lane keeps %vec2. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} vpermpd reading from memory into ymm0.
3762define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
3763; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
3764; GENERIC:       # %bb.0:
3765; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
3766; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [8:1.00]
3767; GENERIC-NEXT:    retq # sched: [1:1.00]
3768;
3769; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
3770; SKX:       # %bb.0:
3771; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
3772; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
3773; SKX-NEXT:    retq # sched: [7:1.00]
3774  %vec = load <4 x double>, <4 x double>* %vp
3775  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3776  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3777  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
3778  ret <4 x double> %res
3779}
3780
; Zero-masked permute of a <4 x double> loaded from %vp with the constant
; pattern [3,2,3,2]. Lanes whose %mask element is zero take the permuted value;
; every other lane becomes 0.0. Both check prefixes expect the write mask built
; in k1 by vptestnmq and a {%k1} {z} vpermpd reading from memory into ymm0.
3781define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
3782; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
3783; GENERIC:       # %bb.0:
3784; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
3785; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [8:1.00]
3786; GENERIC-NEXT:    retq # sched: [1:1.00]
3787;
3788; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
3789; SKX:       # %bb.0:
3790; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
3791; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
3792; SKX-NEXT:    retq # sched: [7:1.00]
3793  %vec = load <4 x double>, <4 x double>* %vp
3794  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ; A lane is enabled when its %mask element compares equal to zero.
3795  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
3796  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
3797  ret <4 x double> %res
3798}
3799
; Unmasked single-source permute of an <8 x double> register argument with the
; constant pattern [5,7,4,2,7,4,3,4]. Both check prefixes expect the index
; vector in zmm1 followed by the three-register vpermpd form.
3800define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
3801; GENERIC-LABEL: test_8xdouble_perm_mask0:
3802; GENERIC:       # %bb.0:
3803; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
3804; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
3805; GENERIC-NEXT:    retq # sched: [1:1.00]
3806;
3807; SKX-LABEL: test_8xdouble_perm_mask0:
3808; SKX:       # %bb.0:
3809; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
3810; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
3811; SKX-NEXT:    retq # sched: [7:1.00]
  ; Second shuffle operand is undef, so only %vec contributes elements.
3812  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
3813  ret <8 x double> %res
3814}
; Merge-masked permute of an <8 x double> with the constant pattern
; [5,7,4,2,7,4,3,4]. Lanes whose %mask element is zero take the permuted value;
; every other lane keeps %vec2. Both check prefixes expect the index vector in
; zmm3, the write mask built in k1 by vptestnmq, the vpermpd writing zmm1
; under {%k1}, and a copy of zmm1 to zmm0.
3815define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
3816; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
3817; GENERIC:       # %bb.0:
3818; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
3819; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
3820; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
3821; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
3822; GENERIC-NEXT:    retq # sched: [1:1.00]
3823;
3824; SKX-LABEL: test_masked_8xdouble_perm_mask0:
3825; SKX:       # %bb.0:
3826; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
3827; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
3828; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
3829; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
3830; SKX-NEXT:    retq # sched: [7:1.00]
3831  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
  ; A lane is enabled when its %mask element compares equal to zero.
3832  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3833  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
3834  ret <8 x double> %res
3835}
3836
; Zero-masked permute of an <8 x double> with the constant pattern
; [5,7,4,2,7,4,3,4]. Lanes whose %mask element is zero take the permuted value;
; every other lane becomes 0.0. Both check prefixes expect the index vector in
; zmm2, the write mask built in k1 by vptestnmq, and a {%k1} {z} vpermpd
; writing zmm0.
3837define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) {
3838; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
3839; GENERIC:       # %bb.0:
3840; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
3841; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
3842; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
3843; GENERIC-NEXT:    retq # sched: [1:1.00]
3844;
3845; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
3846; SKX:       # %bb.0:
3847; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
3848; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
3849; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
3850; SKX-NEXT:    retq # sched: [7:1.00]
3851  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
  ; A lane is enabled when its %mask element compares equal to zero.
3852  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3853  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
3854  ret <8 x double> %res
3855}
; Merge-masked permute of an <8 x double> with the pattern [3,0,0,2,7,4,4,6],
; whose upper four indices repeat the lower four offset by 4. Lanes whose %mask
; element is zero take the permuted value; every other lane keeps %vec2.
; Both check prefixes expect no index-vector load: the write mask is built in
; k1 by vptestnmq, vpermpd writes zmm1 under {%k1}, and zmm1 is copied to zmm0.
3856define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
3857; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
3858; GENERIC:       # %bb.0:
3859; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
3860; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
3861; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
3862; GENERIC-NEXT:    retq # sched: [1:1.00]
3863;
3864; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
3865; SKX:       # %bb.0:
3866; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
3867; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
3868; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
3869; SKX-NEXT:    retq # sched: [7:1.00]
3870  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
  ; A lane is enabled when its %mask element compares equal to zero.
3871  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3872  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
3873  ret <8 x double> %res
3874}
3875
; Zero-masked permute of an <8 x double> with the pattern [3,0,0,2,7,4,4,6],
; whose upper four indices repeat the lower four offset by 4. Lanes whose %mask
; element is zero take the permuted value; every other lane becomes 0.0.
; Both check prefixes expect no index-vector load: the write mask is built in
; k1 by vptestnmq and a {%k1} {z} vpermpd writes zmm0 in place.
3876define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
3877; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
3878; GENERIC:       # %bb.0:
3879; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
3880; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
3881; GENERIC-NEXT:    retq # sched: [1:1.00]
3882;
3883; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
3884; SKX:       # %bb.0:
3885; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
3886; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
3887; SKX-NEXT:    retq # sched: [7:1.00]
3888  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
  ; A lane is enabled when its %mask element compares equal to zero.
3889  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3890  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
3891  ret <8 x double> %res
3892}
; Merge-masked permute of an <8 x double> with the constant pattern
; [7,5,5,5,3,5,1,7]. Lanes whose %mask element is zero take the permuted value;
; every other lane keeps %vec2. Both check prefixes expect the index vector in
; zmm3, the write mask built in k1 by vptestnmq, the vpermpd writing zmm1
; under {%k1}, and a copy of zmm1 to zmm0.
3893define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
3894; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
3895; GENERIC:       # %bb.0:
3896; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
3897; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
3898; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
3899; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
3900; GENERIC-NEXT:    retq # sched: [1:1.00]
3901;
3902; SKX-LABEL: test_masked_8xdouble_perm_mask2:
3903; SKX:       # %bb.0:
3904; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
3905; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
3906; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
3907; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
3908; SKX-NEXT:    retq # sched: [7:1.00]
3909  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
  ; A lane is enabled when its %mask element compares equal to zero.
3910  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3911  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
3912  ret <8 x double> %res
3913}
3914
; Zero-masked permute of an <8 x double> with the constant pattern
; [7,5,5,5,3,5,1,7]. Lanes whose %mask element is zero take the permuted value;
; every other lane becomes 0.0. Both check prefixes expect the index vector in
; zmm2, the write mask built in k1 by vptestnmq, and a {%k1} {z} vpermpd
; writing zmm0.
3915define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) {
3916; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
3917; GENERIC:       # %bb.0:
3918; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
3919; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
3920; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
3921; GENERIC-NEXT:    retq # sched: [1:1.00]
3922;
3923; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
3924; SKX:       # %bb.0:
3925; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
3926; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
3927; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
3928; SKX-NEXT:    retq # sched: [7:1.00]
3929  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
  ; A lane is enabled when its %mask element compares equal to zero.
3930  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3931  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
3932  ret <8 x double> %res
3933}
; Unmasked <8 x double> permute using indices <1,3,3,0,5,7,7,4> (the upper four
; indices are the lower four plus 4). Both check prefixes expect a single
; vpermpd printed with a decoded shuffle pattern and no separate index-vector
; load.
3934define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
3935; GENERIC-LABEL: test_8xdouble_perm_imm_mask3:
3936; GENERIC:       # %bb.0:
3937; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
3938; GENERIC-NEXT:    retq # sched: [1:1.00]
3939;
3940; SKX-LABEL: test_8xdouble_perm_imm_mask3:
3941; SKX:       # %bb.0:
3942; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
3943; SKX-NEXT:    retq # sched: [7:1.00]
3944  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
3945  ret <8 x double> %res
3946}
; Merge-masked <8 x double> permute using indices <1,3,3,0,5,7,7,4> (the upper
; four indices are the lower four plus 4): result lanes whose %mask element is
; zero take the shuffled value, all others keep %vec2. Both check prefixes
; expect vptestnmq into %k1, a {%k1}-masked vpermpd (decoded shuffle pattern,
; no index-vector load) into zmm1 and a final vmovapd of zmm1 into zmm0.
3947define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
3948; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
3949; GENERIC:       # %bb.0:
3950; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
3951; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
3952; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
3953; GENERIC-NEXT:    retq # sched: [1:1.00]
3954;
3955; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
3956; SKX:       # %bb.0:
3957; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
3958; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
3959; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
3960; SKX-NEXT:    retq # sched: [7:1.00]
3961  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
3962  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3963  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
3964  ret <8 x double> %res
3965}
3966
; Zero-masked <8 x double> permute using indices <1,3,3,0,5,7,7,4> (the upper
; four indices are the lower four plus 4): result lanes whose %mask element is
; zero take the shuffled value, all others are zero. Both check prefixes expect
; vptestnmq into %k1 and a {%k1} {z} vpermpd (decoded shuffle pattern, no
; index-vector load) into zmm0.
3967define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
3968; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
3969; GENERIC:       # %bb.0:
3970; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
3971; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
3972; GENERIC-NEXT:    retq # sched: [1:1.00]
3973;
3974; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
3975; SKX:       # %bb.0:
3976; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
3977; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
3978; SKX-NEXT:    retq # sched: [7:1.00]
3979  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
3980  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
3981  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
3982  ret <8 x double> %res
3983}
; Merge-masked <8 x double> permute using indices <3,5,3,4,6,5,7,1>: result
; lanes whose %mask element is zero take the shuffled value, all others keep
; %vec2. Both check prefixes expect the indices loaded as a constant vector,
; the predicate built in %k1 by vptestnmq, a {%k1}-masked vpermpd into zmm1 and
; a final vmovapd of zmm1 into zmm0.
3984define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
3985; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
3986; GENERIC:       # %bb.0:
3987; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
3988; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
3989; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
3990; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
3991; GENERIC-NEXT:    retq # sched: [1:1.00]
3992;
3993; SKX-LABEL: test_masked_8xdouble_perm_mask4:
3994; SKX:       # %bb.0:
3995; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
3996; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
3997; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
3998; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
3999; SKX-NEXT:    retq # sched: [7:1.00]
4000  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
4001  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4002  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4003  ret <8 x double> %res
4004}
4005
; Zero-masked <8 x double> permute using indices <3,5,3,4,6,5,7,1>: result
; lanes whose %mask element is zero take the shuffled value, all others are
; zero. Both check prefixes expect the indices loaded as a constant vector, the
; predicate built in %k1 by vptestnmq and a {%k1} {z} vpermpd into zmm0.
4006define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) {
4007; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
4008; GENERIC:       # %bb.0:
4009; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
4010; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4011; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
4012; GENERIC-NEXT:    retq # sched: [1:1.00]
4013;
4014; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
4015; SKX:       # %bb.0:
4016; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
4017; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4018; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
4019; SKX-NEXT:    retq # sched: [7:1.00]
4020  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
4021  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4022  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4023  ret <8 x double> %res
4024}
; Merge-masked <8 x double> permute using indices <3,3,2,3,7,7,6,7> (the upper
; four indices are the lower four plus 4): result lanes whose %mask element is
; zero take the shuffled value, all others keep %vec2. Both check prefixes
; expect vptestnmq into %k1, a {%k1}-masked vpermpd (decoded shuffle pattern,
; no index-vector load) into zmm1 and a final vmovapd of zmm1 into zmm0.
4025define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
4026; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
4027; GENERIC:       # %bb.0:
4028; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
4029; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
4030; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
4031; GENERIC-NEXT:    retq # sched: [1:1.00]
4032;
4033; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
4034; SKX:       # %bb.0:
4035; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
4036; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
4037; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
4038; SKX-NEXT:    retq # sched: [7:1.00]
4039  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
4040  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4041  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4042  ret <8 x double> %res
4043}
4044
; Zero-masked <8 x double> permute using indices <3,3,2,3,7,7,6,7> (the upper
; four indices are the lower four plus 4): result lanes whose %mask element is
; zero take the shuffled value, all others are zero. Both check prefixes expect
; vptestnmq into %k1 and a {%k1} {z} vpermpd (decoded shuffle pattern, no
; index-vector load) into zmm0.
4045define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
4046; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
4047; GENERIC:       # %bb.0:
4048; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4049; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
4050; GENERIC-NEXT:    retq # sched: [1:1.00]
4051;
4052; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
4053; SKX:       # %bb.0:
4054; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4055; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
4056; SKX-NEXT:    retq # sched: [7:1.00]
4057  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
4058  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4059  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4060  ret <8 x double> %res
4061}
; Unmasked <8 x double> permute using indices <2,7,6,4,0,0,0,2>. Both check
; prefixes expect the indices loaded as a constant vector (via vmovaps here)
; followed by a register-indexed vpermpd into zmm0.
4062define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
4063; GENERIC-LABEL: test_8xdouble_perm_mask6:
4064; GENERIC:       # %bb.0:
4065; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
4066; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
4067; GENERIC-NEXT:    retq # sched: [1:1.00]
4068;
4069; SKX-LABEL: test_8xdouble_perm_mask6:
4070; SKX:       # %bb.0:
4071; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
4072; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
4073; SKX-NEXT:    retq # sched: [7:1.00]
4074  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
4075  ret <8 x double> %res
4076}
; Merge-masked <8 x double> permute using indices <2,7,6,4,0,0,0,2>: result
; lanes whose %mask element is zero take the shuffled value, all others keep
; %vec2. Both check prefixes expect the indices loaded as a constant vector,
; the predicate built in %k1 by vptestnmq, a {%k1}-masked vpermpd into zmm1 and
; a final vmovapd of zmm1 into zmm0.
4077define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
4078; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
4079; GENERIC:       # %bb.0:
4080; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
4081; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
4082; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
4083; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
4084; GENERIC-NEXT:    retq # sched: [1:1.00]
4085;
4086; SKX-LABEL: test_masked_8xdouble_perm_mask6:
4087; SKX:       # %bb.0:
4088; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
4089; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
4090; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
4091; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
4092; SKX-NEXT:    retq # sched: [7:1.00]
4093  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
4094  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4095  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4096  ret <8 x double> %res
4097}
4098
; Zero-masked <8 x double> permute using indices <2,7,6,4,0,0,0,2>: result
; lanes whose %mask element is zero take the shuffled value, all others are
; zero. Both check prefixes expect the indices loaded as a constant vector, the
; predicate built in %k1 by vptestnmq and a {%k1} {z} vpermpd into zmm0.
4099define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) {
4100; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
4101; GENERIC:       # %bb.0:
4102; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
4103; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4104; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
4105; GENERIC-NEXT:    retq # sched: [1:1.00]
4106;
4107; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
4108; SKX:       # %bb.0:
4109; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
4110; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4111; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
4112; SKX-NEXT:    retq # sched: [7:1.00]
4113  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
4114  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4115  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4116  ret <8 x double> %res
4117}
; Merge-masked <8 x double> permute using indices <3,1,3,2,7,5,7,6> (the upper
; four indices are the lower four plus 4): result lanes whose %mask element is
; zero take the shuffled value, all others keep %vec2. Both check prefixes
; expect vptestnmq into %k1, a {%k1}-masked vpermpd (decoded shuffle pattern,
; no index-vector load) into zmm1 and a final vmovapd of zmm1 into zmm0.
4118define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
4119; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
4120; GENERIC:       # %bb.0:
4121; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
4122; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
4123; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
4124; GENERIC-NEXT:    retq # sched: [1:1.00]
4125;
4126; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
4127; SKX:       # %bb.0:
4128; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
4129; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
4130; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
4131; SKX-NEXT:    retq # sched: [7:1.00]
4132  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
4133  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4134  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4135  ret <8 x double> %res
4136}
4137
; Zero-masked <8 x double> permute using indices <3,1,3,2,7,5,7,6> (the upper
; four indices are the lower four plus 4): result lanes whose %mask element is
; zero take the shuffled value, all others are zero. Both check prefixes expect
; vptestnmq into %k1 and a {%k1} {z} vpermpd (decoded shuffle pattern, no
; index-vector load) into zmm0.
4138define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
4139; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
4140; GENERIC:       # %bb.0:
4141; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4142; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
4143; GENERIC-NEXT:    retq # sched: [1:1.00]
4144;
4145; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
4146; SKX:       # %bb.0:
4147; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4148; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
4149; SKX-NEXT:    retq # sched: [7:1.00]
4150  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
4151  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4152  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4153  ret <8 x double> %res
4154}
; Unmasked <8 x double> permute of a vector loaded from %vp, using indices
; <0,3,4,0,4,2,0,1>. Both check prefixes expect the indices loaded as a
; constant vector (via vmovaps here) and a vpermpd that takes its data operand
; from memory at (%rdi), with no separate load instruction.
4155define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
4156; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
4157; GENERIC:       # %bb.0:
4158; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
4159; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
4160; GENERIC-NEXT:    retq # sched: [1:1.00]
4161;
4162; SKX-LABEL: test_8xdouble_perm_mem_mask0:
4163; SKX:       # %bb.0:
4164; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
4165; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
4166; SKX-NEXT:    retq # sched: [7:1.00]
4167  %vec = load <8 x double>, <8 x double>* %vp
4168  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
4169  ret <8 x double> %res
4170}
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <0,3,4,0,4,2,0,1>: result lanes whose %mask element is zero take the shuffled
; value, all others keep %vec2. Both check prefixes expect the indices loaded
; as a constant vector, the predicate built in %k1 by vptestnmq and a
; {%k1}-masked vpermpd reading (%rdi) and writing zmm0, with no separate load.
4171define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4172; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
4173; GENERIC:       # %bb.0:
4174; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
4175; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4176; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
4177; GENERIC-NEXT:    retq # sched: [1:1.00]
4178;
4179; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
4180; SKX:       # %bb.0:
4181; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
4182; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4183; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
4184; SKX-NEXT:    retq # sched: [7:1.00]
4185  %vec = load <8 x double>, <8 x double>* %vp
4186  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
4187  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4188  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4189  ret <8 x double> %res
4190}
4191
; Zero-masked <8 x double> permute of a vector loaded from %vp, using indices
; <0,3,4,0,4,2,0,1>: result lanes whose %mask element is zero take the shuffled
; value, all others are zero. Both check prefixes expect the indices loaded as
; a constant vector, the predicate built in %k1 by vptestnmq and a {%k1} {z}
; vpermpd reading (%rdi) and writing zmm0, with no separate load.
4192define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) {
4193; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
4194; GENERIC:       # %bb.0:
4195; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
4196; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
4197; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
4198; GENERIC-NEXT:    retq # sched: [1:1.00]
4199;
4200; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
4201; SKX:       # %bb.0:
4202; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
4203; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
4204; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
4205; SKX-NEXT:    retq # sched: [7:1.00]
4206  %vec = load <8 x double>, <8 x double>* %vp
4207  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
4208  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4209  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4210  ret <8 x double> %res
4211}
4212
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <0,2,0,3,4,6,4,7> (the upper four indices are the lower four plus 4): result
; lanes whose %mask element is zero take the shuffled value, all others keep
; %vec2. Both check prefixes expect vptestnmq into %k1 and a {%k1}-masked
; vpermpd with a decoded mem[...] pattern writing zmm0; no index-vector load
; and no separate data load appear.
4213define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4214; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
4215; GENERIC:       # %bb.0:
4216; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4217; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
4218; GENERIC-NEXT:    retq # sched: [1:1.00]
4219;
4220; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
4221; SKX:       # %bb.0:
4222; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4223; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
4224; SKX-NEXT:    retq # sched: [7:1.00]
4225  %vec = load <8 x double>, <8 x double>* %vp
4226  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
4227  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4228  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4229  ret <8 x double> %res
4230}
4231
; Zero-masked <8 x double> permute of a vector loaded from %vp, using indices
; <0,2,0,3,4,6,4,7> (the upper four indices are the lower four plus 4): result
; lanes whose %mask element is zero take the shuffled value, all others are
; zero. Both check prefixes expect vptestnmq into %k1 and a {%k1} {z} vpermpd
; with a decoded mem[...] pattern writing zmm0; no index-vector load and no
; separate data load appear.
4232define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) {
4233; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
4234; GENERIC:       # %bb.0:
4235; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
4236; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
4237; GENERIC-NEXT:    retq # sched: [1:1.00]
4238;
4239; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
4240; SKX:       # %bb.0:
4241; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
4242; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
4243; SKX-NEXT:    retq # sched: [7:1.00]
4244  %vec = load <8 x double>, <8 x double>* %vp
4245  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
4246  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4247  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4248  ret <8 x double> %res
4249}
4250
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <6,7,2,7,7,6,2,5>: result lanes whose %mask element is zero take the shuffled
; value, all others keep %vec2. Both check prefixes expect the indices loaded
; as a constant vector, the predicate built in %k1 by vptestnmq and a
; {%k1}-masked vpermpd reading (%rdi) and writing zmm0, with no separate load.
4251define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4252; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
4253; GENERIC:       # %bb.0:
4254; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
4255; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4256; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
4257; GENERIC-NEXT:    retq # sched: [1:1.00]
4258;
4259; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
4260; SKX:       # %bb.0:
4261; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
4262; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4263; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
4264; SKX-NEXT:    retq # sched: [7:1.00]
4265  %vec = load <8 x double>, <8 x double>* %vp
4266  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
4267  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4268  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4269  ret <8 x double> %res
4270}
4271
; Zero-masked <8 x double> permute of a vector loaded from %vp, using indices
; <6,7,2,7,7,6,2,5>: result lanes whose %mask element is zero take the shuffled
; value, all others are zero. Both check prefixes expect the indices loaded as
; a constant vector, the predicate built in %k1 by vptestnmq and a {%k1} {z}
; vpermpd reading (%rdi) and writing zmm0, with no separate load.
4272define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) {
4273; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
4274; GENERIC:       # %bb.0:
4275; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
4276; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
4277; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
4278; GENERIC-NEXT:    retq # sched: [1:1.00]
4279;
4280; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
4281; SKX:       # %bb.0:
4282; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
4283; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
4284; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
4285; SKX-NEXT:    retq # sched: [7:1.00]
4286  %vec = load <8 x double>, <8 x double>* %vp
4287  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
4288  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4289  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4290  ret <8 x double> %res
4291}
4292
; Unmasked <8 x double> permute of a vector loaded from %vp, using indices
; <2,1,1,0,6,5,5,4> (the upper four indices are the lower four plus 4). Both
; check prefixes expect a single vpermpd with a decoded mem[...] pattern
; writing zmm0; no index-vector load and no separate data load appear.
4293define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
4294; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3:
4295; GENERIC:       # %bb.0:
4296; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
4297; GENERIC-NEXT:    retq # sched: [1:1.00]
4298;
4299; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3:
4300; SKX:       # %bb.0:
4301; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
4302; SKX-NEXT:    retq # sched: [7:1.00]
4303  %vec = load <8 x double>, <8 x double>* %vp
4304  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
4305  ret <8 x double> %res
4306}
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <2,1,1,0,6,5,5,4> (the upper four indices are the lower four plus 4): result
; lanes whose %mask element is zero take the shuffled value, all others keep
; %vec2. Both check prefixes expect vptestnmq into %k1 and a {%k1}-masked
; vpermpd with a decoded mem[...] pattern writing zmm0; no index-vector load
; and no separate data load appear.
4307define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4308; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
4309; GENERIC:       # %bb.0:
4310; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4311; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
4312; GENERIC-NEXT:    retq # sched: [1:1.00]
4313;
4314; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
4315; SKX:       # %bb.0:
4316; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4317; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
4318; SKX-NEXT:    retq # sched: [7:1.00]
4319  %vec = load <8 x double>, <8 x double>* %vp
4320  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
4321  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4322  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4323  ret <8 x double> %res
4324}
4325
; Zero-masked <8 x double> permute of a vector loaded from %vp, using indices
; <2,1,1,0,6,5,5,4> (the upper four indices are the lower four plus 4): result
; lanes whose %mask element is zero take the shuffled value, all others are
; zero. Both check prefixes expect vptestnmq into %k1 and a {%k1} {z} vpermpd
; with a decoded mem[...] pattern writing zmm0; no index-vector load and no
; separate data load appear.
4326define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) {
4327; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
4328; GENERIC:       # %bb.0:
4329; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
4330; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
4331; GENERIC-NEXT:    retq # sched: [1:1.00]
4332;
4333; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
4334; SKX:       # %bb.0:
4335; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
4336; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
4337; SKX-NEXT:    retq # sched: [7:1.00]
4338  %vec = load <8 x double>, <8 x double>* %vp
4339  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
4340  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4341  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4342  ret <8 x double> %res
4343}
4344
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <1,1,3,5,6,0,6,0>: result lanes whose %mask element is zero take the shuffled
; value, all others keep %vec2. Both check prefixes expect the indices loaded
; as a constant vector, the predicate built in %k1 by vptestnmq and a
; {%k1}-masked vpermpd reading (%rdi) and writing zmm0, with no separate load.
4345define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4346; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
4347; GENERIC:       # %bb.0:
4348; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
4349; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4350; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
4351; GENERIC-NEXT:    retq # sched: [1:1.00]
4352;
4353; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
4354; SKX:       # %bb.0:
4355; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
4356; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4357; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
4358; SKX-NEXT:    retq # sched: [7:1.00]
4359  %vec = load <8 x double>, <8 x double>* %vp
4360  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
4361  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4362  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4363  ret <8 x double> %res
4364}
4365
; Zero-masked <8 x double> permute of a vector loaded from %vp, using indices
; <1,1,3,5,6,0,6,0>: result lanes whose %mask element is zero take the shuffled
; value, all others are zero. Both check prefixes expect the indices loaded as
; a constant vector, the predicate built in %k1 by vptestnmq and a {%k1} {z}
; vpermpd reading (%rdi) and writing zmm0, with no separate load.
4366define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) {
4367; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
4368; GENERIC:       # %bb.0:
4369; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
4370; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
4371; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
4372; GENERIC-NEXT:    retq # sched: [1:1.00]
4373;
4374; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
4375; SKX:       # %bb.0:
4376; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
4377; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
4378; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
4379; SKX-NEXT:    retq # sched: [7:1.00]
4380  %vec = load <8 x double>, <8 x double>* %vp
4381  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
4382  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4383  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4384  ret <8 x double> %res
4385}
4386
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <2,2,2,3,6,6,6,7> (the upper four indices are the lower four plus 4): result
; lanes whose %mask element is zero take the shuffled value, all others keep
; %vec2. Both check prefixes expect vptestnmq into %k1 and a {%k1}-masked
; vpermpd with a decoded mem[...] pattern writing zmm0; no index-vector load
; and no separate data load appear.
4387define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4388; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
4389; GENERIC:       # %bb.0:
4390; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4391; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
4392; GENERIC-NEXT:    retq # sched: [1:1.00]
4393;
4394; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
4395; SKX:       # %bb.0:
4396; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4397; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
4398; SKX-NEXT:    retq # sched: [7:1.00]
4399  %vec = load <8 x double>, <8 x double>* %vp
4400  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
4401  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4402  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4403  ret <8 x double> %res
4404}
4405
; Zero-masked <8 x double> permute of a vector loaded from %vp, using indices
; <2,2,2,3,6,6,6,7> (the upper four indices are the lower four plus 4): result
; lanes whose %mask element is zero take the shuffled value, all others are
; zero. Both check prefixes expect vptestnmq into %k1 and a {%k1} {z} vpermpd
; with a decoded mem[...] pattern writing zmm0; no index-vector load and no
; separate data load appear.
4406define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) {
4407; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
4408; GENERIC:       # %bb.0:
4409; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
4410; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
4411; GENERIC-NEXT:    retq # sched: [1:1.00]
4412;
4413; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
4414; SKX:       # %bb.0:
4415; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
4416; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
4417; SKX-NEXT:    retq # sched: [7:1.00]
4418  %vec = load <8 x double>, <8 x double>* %vp
4419  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
4420  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4421  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4422  ret <8 x double> %res
4423}
4424
; Unmasked <8 x double> permute of a vector loaded from %vp, using indices
; <2,4,0,4,6,1,2,5>. Both check prefixes expect the indices loaded as a
; constant vector (via vmovaps here) and a vpermpd that takes its data operand
; from memory at (%rdi), with no separate load instruction.
4425define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
4426; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
4427; GENERIC:       # %bb.0:
4428; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
4429; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
4430; GENERIC-NEXT:    retq # sched: [1:1.00]
4431;
4432; SKX-LABEL: test_8xdouble_perm_mem_mask6:
4433; SKX:       # %bb.0:
4434; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
4435; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
4436; SKX-NEXT:    retq # sched: [7:1.00]
4437  %vec = load <8 x double>, <8 x double>* %vp
4438  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
4439  ret <8 x double> %res
4440}
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <2,4,0,4,6,1,2,5>: result lanes whose %mask element is zero take the shuffled
; value, all others keep %vec2. Both check prefixes expect the indices loaded
; as a constant vector, the predicate built in %k1 by vptestnmq and a
; {%k1}-masked vpermpd reading (%rdi) and writing zmm0, with no separate load.
4441define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4442; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
4443; GENERIC:       # %bb.0:
4444; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
4445; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4446; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
4447; GENERIC-NEXT:    retq # sched: [1:1.00]
4448;
4449; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
4450; SKX:       # %bb.0:
4451; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
4452; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4453; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
4454; SKX-NEXT:    retq # sched: [7:1.00]
4455  %vec = load <8 x double>, <8 x double>* %vp
4456  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
4457  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4458  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4459  ret <8 x double> %res
4460}
4461
; Zero-masked <8 x double> permute of a vector loaded from %vp, using indices
; <2,4,0,4,6,1,2,5>: result lanes whose %mask element is zero take the shuffled
; value, all others are zero. Both check prefixes expect the indices loaded as
; a constant vector, the predicate built in %k1 by vptestnmq and a {%k1} {z}
; vpermpd reading (%rdi) and writing zmm0, with no separate load.
4462define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) {
4463; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
4464; GENERIC:       # %bb.0:
4465; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
4466; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
4467; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
4468; GENERIC-NEXT:    retq # sched: [1:1.00]
4469;
4470; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
4471; SKX:       # %bb.0:
4472; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
4473; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
4474; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
4475; SKX-NEXT:    retq # sched: [7:1.00]
4476  %vec = load <8 x double>, <8 x double>* %vp
4477  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
4478  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4479  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
4480  ret <8 x double> %res
4481}
4482
; Merge-masked <8 x double> permute of a vector loaded from %vp, using indices
; <0,3,2,0,4,7,6,4> (the upper four indices are the lower four plus 4): result
; lanes whose %mask element is zero take the shuffled value, all others keep
; %vec2. Both check prefixes expect vptestnmq into %k1 and a {%k1}-masked
; vpermpd with a decoded mem[...] pattern writing zmm0; no index-vector load
; and no separate data load appear.
4483define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
4484; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
4485; GENERIC:       # %bb.0:
4486; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
4487; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
4488; GENERIC-NEXT:    retq # sched: [1:1.00]
4489;
4490; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
4491; SKX:       # %bb.0:
4492; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
4493; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
4494; SKX-NEXT:    retq # sched: [7:1.00]
4495  %vec = load <8 x double>, <8 x double>* %vp
4496  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
4497  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
4498  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
4499  ret <8 x double> %res
4500}
4501
; Zero-masked permute of an <8 x double> loaded from %vp, using the index
; pattern <0,3,2,0,4,7,6,4>. Lanes whose %mask element equals zero take the
; permuted value; the remaining lanes are zeroed. Both runs expect vptestnmq to
; build %k1 and a single vpermpd that reads its source from memory under
; {%k1} {z}.
define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) {
; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <8 x double>, <8 x double>* %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
  ret <8 x double> %res
}
4520
; Unmasked single-source byte shuffle of the <16 x i8> argument %vec.
; Both runs expect exactly one vpshufb on xmm0 followed by retq.
define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
; GENERIC-LABEL: test_16xi8_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi8_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  ret <16 x i8> %res
}
; Merge-masked byte shuffle of %vec. Lanes whose %mask byte equals zero take
; the shuffled byte; all other lanes keep %vec2. Both runs expect vptestnmb to
; build %k1, a vpshufb writing xmm1 under {%k1}, and a vmovdqa copying xmm1 to
; xmm0 before retq.
define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4553
; Zero-masked byte shuffle of %vec. Lanes whose %mask byte equals zero take the
; shuffled byte; all other lanes become zero. Both runs expect vptestnmb to
; build %k1 followed by one vpshufb on xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
; Merge-masked byte shuffle of %vec (second index pattern). Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes keep %vec2. Both
; runs expect vptestnmb, a vpshufb into xmm1 under {%k1}, then a vmovdqa
; copying xmm1 to xmm0.
define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4590
; Zero-masked byte shuffle of %vec (second index pattern). Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes become zero. Both
; runs expect vptestnmb followed by one vpshufb on xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
; Merge-masked byte shuffle of %vec (third index pattern). Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes keep %vec2. Both
; runs expect vptestnmb, a vpshufb into xmm1 under {%k1}, then a vmovdqa
; copying xmm1 to xmm0.
define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4627
; Zero-masked byte shuffle of %vec (third index pattern). Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes become zero. Both
; runs expect vptestnmb followed by one vpshufb on xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
; Unmasked single-source byte shuffle of the <16 x i8> argument %vec (fourth
; index pattern). Both runs expect exactly one vpshufb on xmm0 followed by
; retq.
define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
; GENERIC-LABEL: test_16xi8_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi8_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  ret <16 x i8> %res
}
; Merge-masked byte shuffle of %vec (fourth index pattern). Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes keep %vec2. Both
; runs expect vptestnmb, a vpshufb into xmm1 under {%k1}, then a vmovdqa
; copying xmm1 to xmm0.
define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4677
; Zero-masked byte shuffle of %vec (fourth index pattern). Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes become zero. Both
; runs expect vptestnmb followed by one vpshufb on xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
; Unmasked byte shuffle of a <16 x i8> loaded from %vp. Both runs expect a
; separate vmovdqa load from (%rdi) into xmm0, then one vpshufb on that
; register.
define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
; GENERIC-LABEL: test_16xi8_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi8_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  ret <16 x i8> %res
}
; Merge-masked byte shuffle of a <16 x i8> loaded from %vp. Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes keep %vec2. Both
; runs expect a vmovdqa load into xmm2, vptestnmb on xmm1 to build %k1, and a
; vpshufb merging into xmm0 under {%k1}.
define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4731
; Zero-masked byte shuffle of a <16 x i8> loaded from %vp. Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes become zero. Both
; runs expect a vmovdqa load into xmm1, vptestnmb on xmm0 to build %k1, and a
; vpshufb writing xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
4752
; Merge-masked byte shuffle of a <16 x i8> loaded from %vp (second index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes keep %vec2. Both runs expect a vmovdqa load into xmm2, vptestnmb
; on xmm1, and a vpshufb merging into xmm0 under {%k1}.
define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4773
; Zero-masked byte shuffle of a <16 x i8> loaded from %vp (second index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes become zero. Both runs expect a vmovdqa load into xmm1,
; vptestnmb on xmm0, and a vpshufb writing xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
4794
; Merge-masked byte shuffle of a <16 x i8> loaded from %vp (third index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes keep %vec2. Both runs expect a vmovdqa load into xmm2, vptestnmb
; on xmm1, and a vpshufb merging into xmm0 under {%k1}.
define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4815
; Zero-masked byte shuffle of a <16 x i8> loaded from %vp (third index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes become zero. Both runs expect a vmovdqa load into xmm1,
; vptestnmb on xmm0, and a vpshufb writing xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
4836
; Unmasked byte shuffle of a <16 x i8> loaded from %vp (fourth index pattern).
; Both runs expect a separate vmovdqa load from (%rdi) into xmm0, then one
; vpshufb on that register.
define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
; GENERIC-LABEL: test_16xi8_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_16xi8_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  ret <16 x i8> %res
}
; Merge-masked byte shuffle of a <16 x i8> loaded from %vp (fourth index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes keep %vec2. Both runs expect a vmovdqa load into xmm2, vptestnmb
; on xmm1, and a vpshufb merging into xmm0 under {%k1}.
define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
  ret <16 x i8> %res
}
4873
; Zero-masked byte shuffle of a <16 x i8> loaded from %vp (fourth index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes become zero. Both runs expect a vmovdqa load into xmm1,
; vptestnmb on xmm0, and a vpshufb writing xmm0 with {%k1} {z}.
define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %vec = load <16 x i8>, <16 x i8>* %vp
  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
  ret <16 x i8> %res
}
4894
; Unmasked single-source byte shuffle of the <32 x i8> argument %vec. The
; first 16 indices select from elements 0-15 and the last 16 from elements
; 16-31. Both runs expect exactly one vpshufb on ymm0 followed by retq.
define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
; GENERIC-LABEL: test_32xi8_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_32xi8_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
  ret <32 x i8> %res
}
; Merge-masked byte shuffle of the <32 x i8> argument %vec. Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes keep %vec2. Both
; runs expect vptestnmb on ymm2 to build %k1, a vpshufb writing ymm1 under
; {%k1}, and a vmovdqa copying ymm1 to ymm0 before retq.
define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
  ret <32 x i8> %res
}
4927
; Zero-masked byte shuffle of the <32 x i8> argument %vec. Lanes whose %mask
; byte equals zero take the shuffled byte; all other lanes become zero. Both
; runs expect vptestnmb on ymm1 followed by one vpshufb on ymm0 with
; {%k1} {z}.
define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
; Merge-masked byte shuffle of the <32 x i8> argument %vec (second index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes keep %vec2. Both runs expect vptestnmb on ymm2, a vpshufb into
; ymm1 under {%k1}, then a vmovdqa copying ymm1 to ymm0.
define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
  ret <32 x i8> %res
}
4964
; Zero-masked byte shuffle of the <32 x i8> argument %vec (second index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes become zero. Both runs expect vptestnmb on ymm1 followed by one
; vpshufb on ymm0 with {%k1} {z}.
define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
; Merge-masked byte shuffle of the <32 x i8> argument %vec (third index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes keep %vec2. Both runs expect vptestnmb on ymm2, a vpshufb into
; ymm1 under {%k1}, then a vmovdqa copying ymm1 to ymm0.
define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_32xi8_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
  ret <32 x i8> %res
}
5001
; Zero-masked byte shuffle of the <32 x i8> argument %vec (third index
; pattern). Lanes whose %mask byte equals zero take the shuffled byte; all
; other lanes become zero. Both runs expect vptestnmb on ymm1 followed by one
; vpshufb on ymm0 with {%k1} {z}.
define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
; GENERIC:       # %bb.0:
; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
; GENERIC-NEXT:    retq # sched: [1:1.00]
;
; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
  ret <32 x i8> %res
}
; Unmasked byte permutation of a 256-bit register operand. Every shuffle index
; selects from the same 16-byte group as its destination position, and the
; checks expect the whole permutation to be a single vpshufb on ymm0.
5019define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
5020; GENERIC-LABEL: test_32xi8_perm_mask3:
5021; GENERIC:       # %bb.0:
5022; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
5023; GENERIC-NEXT:    retq # sched: [1:1.00]
5024;
5025; SKX-LABEL: test_32xi8_perm_mask3:
5026; SKX:       # %bb.0:
5027; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
5028; SKX-NEXT:    retq # sched: [7:1.00]
5029  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
5030  ret <32 x i8> %res
5031}
; Merge-masked form of the mask3 byte permutation on a 256-bit register
; operand: shuffled bytes of %vec are selected where the matching %mask byte is
; zero, %vec2 bytes elsewhere. The checks expect vptestnmb into %k1, a vpshufb
; writing ymm1 under %k1, then a vmovdqa copy of ymm1 into ymm0.
5032define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
5033; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
5034; GENERIC:       # %bb.0:
5035; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
5036; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
5037; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
5038; GENERIC-NEXT:    retq # sched: [1:1.00]
5039;
5040; SKX-LABEL: test_masked_32xi8_perm_mask3:
5041; SKX:       # %bb.0:
5042; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
5043; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
5044; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
5045; SKX-NEXT:    retq # sched: [7:1.00]
5046  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
5047  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5048  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
5049  ret <32 x i8> %res
5050}
5051
; Zero-masked form of the mask3 byte permutation on a 256-bit register operand:
; shuffled bytes of %vec are kept where the matching %mask byte is zero, and
; the remaining result bytes are zero. The checks expect vptestnmb into %k1 and
; one vpshufb with the {%k1} {z} modifiers.
5052define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
5053; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
5054; GENERIC:       # %bb.0:
5055; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
5056; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
5057; GENERIC-NEXT:    retq # sched: [1:1.00]
5058;
5059; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
5060; SKX:       # %bb.0:
5061; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
5062; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
5063; SKX-NEXT:    retq # sched: [7:1.00]
5064  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
5065  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5066  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
5067  ret <32 x i8> %res
5068}
; Unmasked byte permutation of a 256-bit vector loaded from %vp. The checks
; expect a separate vmovdqa load from (%rdi) into ymm0 followed by a vpshufb
; that permutes ymm0 in place.
5069define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
5070; GENERIC-LABEL: test_32xi8_perm_mem_mask0:
5071; GENERIC:       # %bb.0:
5072; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
5073; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
5074; GENERIC-NEXT:    retq # sched: [1:1.00]
5075;
5076; SKX-LABEL: test_32xi8_perm_mem_mask0:
5077; SKX:       # %bb.0:
5078; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
5079; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
5080; SKX-NEXT:    retq # sched: [7:1.00]
5081  %vec = load <32 x i8>, <32 x i8>* %vp
5082  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
5083  ret <32 x i8> %res
5084}
; Merge-masked byte permutation of a 256-bit vector loaded from %vp: shuffled
; bytes are selected where the matching %mask byte is zero, %vec2 bytes
; elsewhere. The checks expect a vmovdqa load into ymm2, vptestnmb into %k1,
; and a vpshufb of ymm2 writing ymm0 under %k1.
5085define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
5086; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
5087; GENERIC:       # %bb.0:
5088; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5089; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
5090; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
5091; GENERIC-NEXT:    retq # sched: [1:1.00]
5092;
5093; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
5094; SKX:       # %bb.0:
5095; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5096; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
5097; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
5098; SKX-NEXT:    retq # sched: [7:1.00]
5099  %vec = load <32 x i8>, <32 x i8>* %vp
5100  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
5101  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5102  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
5103  ret <32 x i8> %res
5104}
5105
; Zero-masked byte permutation of a 256-bit vector loaded from %vp: shuffled
; bytes are kept where the matching %mask byte is zero, and the remaining
; result bytes are zero. The checks expect a vmovdqa load into ymm1, vptestnmb
; into %k1, and a vpshufb of ymm1 with the {%k1} {z} modifiers.
5106define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
5107; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
5108; GENERIC:       # %bb.0:
5109; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5110; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
5111; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
5112; GENERIC-NEXT:    retq # sched: [1:1.00]
5113;
5114; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
5115; SKX:       # %bb.0:
5116; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5117; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
5118; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
5119; SKX-NEXT:    retq # sched: [7:1.00]
5120  %vec = load <32 x i8>, <32 x i8>* %vp
5121  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
5122  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5123  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
5124  ret <32 x i8> %res
5125}
5126
; Merge-masked mask1 byte permutation of a 256-bit vector loaded from %vp:
; shuffled bytes where the matching %mask byte is zero, %vec2 bytes elsewhere.
; The checks expect a vmovdqa load into ymm2, vptestnmb into %k1, and a
; vpshufb of ymm2 writing ymm0 under %k1.
5127define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
5128; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
5129; GENERIC:       # %bb.0:
5130; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5131; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
5132; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
5133; GENERIC-NEXT:    retq # sched: [1:1.00]
5134;
5135; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
5136; SKX:       # %bb.0:
5137; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5138; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
5139; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
5140; SKX-NEXT:    retq # sched: [7:1.00]
5141  %vec = load <32 x i8>, <32 x i8>* %vp
5142  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
5143  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5144  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
5145  ret <32 x i8> %res
5146}
5147
; Zero-masked mask1 byte permutation of a 256-bit vector loaded from %vp:
; shuffled bytes where the matching %mask byte is zero, zero bytes elsewhere.
; The checks expect a vmovdqa load into ymm1, vptestnmb into %k1, and a
; vpshufb of ymm1 with the {%k1} {z} modifiers.
5148define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
5149; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
5150; GENERIC:       # %bb.0:
5151; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5152; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
5153; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
5154; GENERIC-NEXT:    retq # sched: [1:1.00]
5155;
5156; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
5157; SKX:       # %bb.0:
5158; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5159; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
5160; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
5161; SKX-NEXT:    retq # sched: [7:1.00]
5162  %vec = load <32 x i8>, <32 x i8>* %vp
5163  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
5164  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5165  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
5166  ret <32 x i8> %res
5167}
5168
; Merge-masked mask2 byte permutation of a 256-bit vector loaded from %vp:
; shuffled bytes where the matching %mask byte is zero, %vec2 bytes elsewhere.
; The checks expect a vmovdqa load into ymm2, vptestnmb into %k1, and a
; vpshufb of ymm2 writing ymm0 under %k1.
5169define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
5170; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
5171; GENERIC:       # %bb.0:
5172; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5173; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
5174; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
5175; GENERIC-NEXT:    retq # sched: [1:1.00]
5176;
5177; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
5178; SKX:       # %bb.0:
5179; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5180; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
5181; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
5182; SKX-NEXT:    retq # sched: [7:1.00]
5183  %vec = load <32 x i8>, <32 x i8>* %vp
5184  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
5185  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5186  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
5187  ret <32 x i8> %res
5188}
5189
; Zero-masked mask2 byte permutation of a 256-bit vector loaded from %vp:
; shuffled bytes where the matching %mask byte is zero, zero bytes elsewhere.
; The checks expect a vmovdqa load into ymm1, vptestnmb into %k1, and a
; vpshufb of ymm1 with the {%k1} {z} modifiers.
5190define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
5191; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
5192; GENERIC:       # %bb.0:
5193; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5194; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
5195; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
5196; GENERIC-NEXT:    retq # sched: [1:1.00]
5197;
5198; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
5199; SKX:       # %bb.0:
5200; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5201; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
5202; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
5203; SKX-NEXT:    retq # sched: [7:1.00]
5204  %vec = load <32 x i8>, <32 x i8>* %vp
5205  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
5206  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5207  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
5208  ret <32 x i8> %res
5209}
5210
; Unmasked mask3 byte permutation of a 256-bit vector loaded from %vp. The
; checks expect a separate vmovdqa load from (%rdi) into ymm0 followed by a
; vpshufb that permutes ymm0 in place.
5211define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
5212; GENERIC-LABEL: test_32xi8_perm_mem_mask3:
5213; GENERIC:       # %bb.0:
5214; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
5215; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
5216; GENERIC-NEXT:    retq # sched: [1:1.00]
5217;
5218; SKX-LABEL: test_32xi8_perm_mem_mask3:
5219; SKX:       # %bb.0:
5220; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
5221; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
5222; SKX-NEXT:    retq # sched: [7:1.00]
5223  %vec = load <32 x i8>, <32 x i8>* %vp
5224  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
5225  ret <32 x i8> %res
5226}
; Merge-masked mask3 byte permutation of a 256-bit vector loaded from %vp:
; shuffled bytes where the matching %mask byte is zero, %vec2 bytes elsewhere.
; The checks expect a vmovdqa load into ymm2, vptestnmb into %k1, and a
; vpshufb of ymm2 writing ymm0 under %k1.
5227define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
5228; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
5229; GENERIC:       # %bb.0:
5230; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5231; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
5232; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
5233; GENERIC-NEXT:    retq # sched: [1:1.00]
5234;
5235; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
5236; SKX:       # %bb.0:
5237; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
5238; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
5239; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
5240; SKX-NEXT:    retq # sched: [7:1.00]
5241  %vec = load <32 x i8>, <32 x i8>* %vp
5242  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
5243  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5244  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
5245  ret <32 x i8> %res
5246}
5247
; Zero-masked mask3 byte permutation of a 256-bit vector loaded from %vp:
; shuffled bytes where the matching %mask byte is zero, zero bytes elsewhere.
; The checks expect a vmovdqa load into ymm1, vptestnmb into %k1, and a
; vpshufb of ymm1 with the {%k1} {z} modifiers.
5248define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) {
5249; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
5250; GENERIC:       # %bb.0:
5251; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5252; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
5253; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
5254; GENERIC-NEXT:    retq # sched: [1:1.00]
5255;
5256; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
5257; SKX:       # %bb.0:
5258; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
5259; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
5260; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
5261; SKX-NEXT:    retq # sched: [7:1.00]
5262  %vec = load <32 x i8>, <32 x i8>* %vp
5263  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
5264  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
5265  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
5266  ret <32 x i8> %res
5267}
5268
; Unmasked byte permutation of a 512-bit register operand. Every shuffle index
; selects from the same 16-byte group as its destination position, and the
; checks expect the whole permutation to be a single vpshufb on zmm0.
5269define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
5270; GENERIC-LABEL: test_64xi8_perm_mask0:
5271; GENERIC:       # %bb.0:
5272; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
5273; GENERIC-NEXT:    retq # sched: [1:1.00]
5274;
5275; SKX-LABEL: test_64xi8_perm_mask0:
5276; SKX:       # %bb.0:
5277; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
5278; SKX-NEXT:    retq # sched: [7:1.00]
5279  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
5280  ret <64 x i8> %res
5281}
; Merge-masked mask0 byte permutation of a 512-bit register operand: shuffled
; bytes of %vec are selected where the matching %mask byte is zero, %vec2 bytes
; elsewhere. The checks expect vptestnmb into %k1, a vpshufb writing zmm1 under
; %k1, then a vmovdqa64 copy of zmm1 into zmm0.
5282define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
5283; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
5284; GENERIC:       # %bb.0:
5285; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
5286; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
5287; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
5288; GENERIC-NEXT:    retq # sched: [1:1.00]
5289;
5290; SKX-LABEL: test_masked_64xi8_perm_mask0:
5291; SKX:       # %bb.0:
5292; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
5293; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
5294; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
5295; SKX-NEXT:    retq # sched: [7:1.00]
5296  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
5297  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5298  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5299  ret <64 x i8> %res
5300}
5301
; Zero-masked mask0 byte permutation of a 512-bit register operand: shuffled
; bytes of %vec are kept where the matching %mask byte is zero, and the
; remaining result bytes are zero. The checks expect vptestnmb into %k1 and one
; vpshufb with the {%k1} {z} modifiers.
5302define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
5303; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
5304; GENERIC:       # %bb.0:
5305; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5306; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
5307; GENERIC-NEXT:    retq # sched: [1:1.00]
5308;
5309; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
5310; SKX:       # %bb.0:
5311; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5312; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
5313; SKX-NEXT:    retq # sched: [7:1.00]
5314  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
5315  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5316  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5317  ret <64 x i8> %res
5318}
; Merge-masked mask1 byte permutation of a 512-bit register operand: shuffled
; bytes of %vec are selected where the matching %mask byte is zero, %vec2 bytes
; elsewhere. The checks expect vptestnmb into %k1, a vpshufb writing zmm1 under
; %k1, then a vmovdqa64 copy of zmm1 into zmm0.
5319define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
5320; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
5321; GENERIC:       # %bb.0:
5322; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
5323; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
5324; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
5325; GENERIC-NEXT:    retq # sched: [1:1.00]
5326;
5327; SKX-LABEL: test_masked_64xi8_perm_mask1:
5328; SKX:       # %bb.0:
5329; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
5330; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
5331; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
5332; SKX-NEXT:    retq # sched: [7:1.00]
5333  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
5334  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5335  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5336  ret <64 x i8> %res
5337}
5338
; Zero-masked mask1 byte permutation of a 512-bit register operand: shuffled
; bytes of %vec are kept where the matching %mask byte is zero, and the
; remaining result bytes are zero. The checks expect vptestnmb into %k1 and one
; vpshufb with the {%k1} {z} modifiers.
5339define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
5340; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
5341; GENERIC:       # %bb.0:
5342; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5343; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
5344; GENERIC-NEXT:    retq # sched: [1:1.00]
5345;
5346; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
5347; SKX:       # %bb.0:
5348; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5349; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
5350; SKX-NEXT:    retq # sched: [7:1.00]
5351  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
5352  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5353  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5354  ret <64 x i8> %res
5355}
; Merge-masked mask2 byte permutation of a 512-bit register operand: shuffled
; bytes of %vec are selected where the matching %mask byte is zero, %vec2 bytes
; elsewhere. The checks expect vptestnmb into %k1, a vpshufb writing zmm1 under
; %k1, then a vmovdqa64 copy of zmm1 into zmm0.
5356define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
5357; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
5358; GENERIC:       # %bb.0:
5359; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
5360; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
5361; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
5362; GENERIC-NEXT:    retq # sched: [1:1.00]
5363;
5364; SKX-LABEL: test_masked_64xi8_perm_mask2:
5365; SKX:       # %bb.0:
5366; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
5367; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
5368; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
5369; SKX-NEXT:    retq # sched: [7:1.00]
5370  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
5371  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5372  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5373  ret <64 x i8> %res
5374}
5375
; Zero-masked mask2 byte permutation of a 512-bit register operand: shuffled
; bytes of %vec are kept where the matching %mask byte is zero, and the
; remaining result bytes are zero. The checks expect vptestnmb into %k1 and one
; vpshufb with the {%k1} {z} modifiers.
5376define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
5377; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
5378; GENERIC:       # %bb.0:
5379; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5380; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
5381; GENERIC-NEXT:    retq # sched: [1:1.00]
5382;
5383; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
5384; SKX:       # %bb.0:
5385; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5386; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
5387; SKX-NEXT:    retq # sched: [7:1.00]
5388  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
5389  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5390  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5391  ret <64 x i8> %res
5392}
; Unmasked mask3 byte permutation of a 512-bit register operand. Every shuffle
; index selects from the same 16-byte group as its destination position, and
; the checks expect the whole permutation to be a single vpshufb on zmm0.
5393define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
5394; GENERIC-LABEL: test_64xi8_perm_mask3:
5395; GENERIC:       # %bb.0:
5396; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
5397; GENERIC-NEXT:    retq # sched: [1:1.00]
5398;
5399; SKX-LABEL: test_64xi8_perm_mask3:
5400; SKX:       # %bb.0:
5401; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
5402; SKX-NEXT:    retq # sched: [7:1.00]
5403  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
5404  ret <64 x i8> %res
5405}
; Merge-masked mask3 byte permutation of a 512-bit register operand: shuffled
; bytes of %vec are selected where the matching %mask byte is zero, %vec2 bytes
; elsewhere. The checks expect vptestnmb into %k1, a vpshufb writing zmm1 under
; %k1, then a vmovdqa64 copy of zmm1 into zmm0.
5406define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
5407; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
5408; GENERIC:       # %bb.0:
5409; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
5410; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
5411; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
5412; GENERIC-NEXT:    retq # sched: [1:1.00]
5413;
5414; SKX-LABEL: test_masked_64xi8_perm_mask3:
5415; SKX:       # %bb.0:
5416; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
5417; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
5418; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
5419; SKX-NEXT:    retq # sched: [7:1.00]
5420  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
5421  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5422  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5423  ret <64 x i8> %res
5424}
5425
; Zero-masked byte shuffle of a <64 x i8> register argument: result elements
; whose %mask byte equals zero take the shuffled value, all others are zero
; (the select's false operand is zeroinitializer).
; Both check prefixes expect vptestnmb into %k1 and a vpshufb on zmm0 with
; {%k1} {z}.
5426define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
5427; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
5428; GENERIC:       # %bb.0:
5429; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5430; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
5431; GENERIC-NEXT:    retq # sched: [1:1.00]
5432;
5433; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
5434; SKX:       # %bb.0:
5435; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5436; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
5437; SKX-NEXT:    retq # sched: [7:1.00]
5438  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
5439  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5440  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5441  ret <64 x i8> %res
5442}
; Unmasked byte shuffle of a <64 x i8> loaded from %vp, using a constant mask
; that keeps every result byte within its 16-byte group of the loaded vector.
; Both check prefixes expect a vmovdqa64 load from (%rdi) followed by a
; register-to-register vpshufb, i.e. the load is not folded into the shuffle.
5443define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
5444; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
5445; GENERIC:       # %bb.0:
5446; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
5447; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
5448; GENERIC-NEXT:    retq # sched: [1:1.00]
5449;
5450; SKX-LABEL: test_64xi8_perm_mem_mask0:
5451; SKX:       # %bb.0:
5452; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
5453; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
5454; SKX-NEXT:    retq # sched: [7:1.00]
5455  %vec = load <64 x i8>, <64 x i8>* %vp
5456  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
5457  ret <64 x i8> %res
5458}
; Merge-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others keep the
; corresponding %vec2 element.
; Both check prefixes expect a vmovdqa64 load into zmm2, vptestnmb into %k1,
; and a vpshufb from zmm2 writing zmm0 under {%k1}.
5459define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
5460; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
5461; GENERIC:       # %bb.0:
5462; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
5463; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5464; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
5465; GENERIC-NEXT:    retq # sched: [1:1.00]
5466;
5467; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
5468; SKX:       # %bb.0:
5469; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
5470; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5471; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
5472; SKX-NEXT:    retq # sched: [7:1.00]
5473  %vec = load <64 x i8>, <64 x i8>* %vp
5474  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
5475  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5476  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5477  ret <64 x i8> %res
5478}
5479
; Zero-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others are zero
; (the select's false operand is zeroinitializer).
; Both check prefixes expect a vmovdqa64 load into zmm1, vptestnmb into %k1,
; and a vpshufb from zmm1 writing zmm0 with {%k1} {z}.
5480define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
5481; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
5482; GENERIC:       # %bb.0:
5483; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
5484; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
5485; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
5486; GENERIC-NEXT:    retq # sched: [1:1.00]
5487;
5488; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
5489; SKX:       # %bb.0:
5490; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
5491; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
5492; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
5493; SKX-NEXT:    retq # sched: [7:1.00]
5494  %vec = load <64 x i8>, <64 x i8>* %vp
5495  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
5496  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5497  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5498  ret <64 x i8> %res
5499}
5500
; Merge-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others keep the
; corresponding %vec2 element.
; Both check prefixes expect a vmovdqa64 load into zmm2, vptestnmb into %k1,
; and a vpshufb from zmm2 writing zmm0 under {%k1}.
5501define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
5502; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
5503; GENERIC:       # %bb.0:
5504; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
5505; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5506; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
5507; GENERIC-NEXT:    retq # sched: [1:1.00]
5508;
5509; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
5510; SKX:       # %bb.0:
5511; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
5512; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5513; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
5514; SKX-NEXT:    retq # sched: [7:1.00]
5515  %vec = load <64 x i8>, <64 x i8>* %vp
5516  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
5517  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5518  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5519  ret <64 x i8> %res
5520}
5521
; Zero-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others are zero
; (the select's false operand is zeroinitializer).
; Both check prefixes expect a vmovdqa64 load into zmm1, vptestnmb into %k1,
; and a vpshufb from zmm1 writing zmm0 with {%k1} {z}.
5522define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
5523; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
5524; GENERIC:       # %bb.0:
5525; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
5526; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
5527; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
5528; GENERIC-NEXT:    retq # sched: [1:1.00]
5529;
5530; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
5531; SKX:       # %bb.0:
5532; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
5533; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
5534; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
5535; SKX-NEXT:    retq # sched: [7:1.00]
5536  %vec = load <64 x i8>, <64 x i8>* %vp
5537  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
5538  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5539  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5540  ret <64 x i8> %res
5541}
5542
; Merge-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others keep the
; corresponding %vec2 element.
; Both check prefixes expect a vmovdqa64 load into zmm2, vptestnmb into %k1,
; and a vpshufb from zmm2 writing zmm0 under {%k1}.
5543define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
5544; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
5545; GENERIC:       # %bb.0:
5546; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
5547; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5548; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
5549; GENERIC-NEXT:    retq # sched: [1:1.00]
5550;
5551; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
5552; SKX:       # %bb.0:
5553; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
5554; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5555; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
5556; SKX-NEXT:    retq # sched: [7:1.00]
5557  %vec = load <64 x i8>, <64 x i8>* %vp
5558  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
5559  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5560  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5561  ret <64 x i8> %res
5562}
5563
; Zero-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others are zero
; (the select's false operand is zeroinitializer).
; Both check prefixes expect a vmovdqa64 load into zmm1, vptestnmb into %k1,
; and a vpshufb from zmm1 writing zmm0 with {%k1} {z}.
5564define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
5565; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
5566; GENERIC:       # %bb.0:
5567; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
5568; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
5569; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
5570; GENERIC-NEXT:    retq # sched: [1:1.00]
5571;
5572; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
5573; SKX:       # %bb.0:
5574; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
5575; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
5576; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
5577; SKX-NEXT:    retq # sched: [7:1.00]
5578  %vec = load <64 x i8>, <64 x i8>* %vp
5579  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
5580  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5581  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5582  ret <64 x i8> %res
5583}
5584
; Unmasked byte shuffle of a <64 x i8> loaded from %vp, using a constant mask
; that keeps every result byte within its 16-byte group of the loaded vector.
; Both check prefixes expect a vmovdqa64 load from (%rdi) followed by a
; register-to-register vpshufb, i.e. the load is not folded into the shuffle.
5585define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
5586; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
5587; GENERIC:       # %bb.0:
5588; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
5589; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
5590; GENERIC-NEXT:    retq # sched: [1:1.00]
5591;
5592; SKX-LABEL: test_64xi8_perm_mem_mask3:
5593; SKX:       # %bb.0:
5594; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
5595; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
5596; SKX-NEXT:    retq # sched: [7:1.00]
5597  %vec = load <64 x i8>, <64 x i8>* %vp
5598  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
5599  ret <64 x i8> %res
5600}
; Merge-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others keep the
; corresponding %vec2 element.
; Both check prefixes expect a vmovdqa64 load into zmm2, vptestnmb into %k1,
; and a vpshufb from zmm2 writing zmm0 under {%k1}.
5601define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
5602; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
5603; GENERIC:       # %bb.0:
5604; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
5605; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
5606; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
5607; GENERIC-NEXT:    retq # sched: [1:1.00]
5608;
5609; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
5610; SKX:       # %bb.0:
5611; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
5612; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
5613; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
5614; SKX-NEXT:    retq # sched: [7:1.00]
5615  %vec = load <64 x i8>, <64 x i8>* %vp
5616  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
5617  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5618  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
5619  ret <64 x i8> %res
5620}
5621
; Zero-masked byte shuffle of a <64 x i8> loaded from %vp: result elements
; whose %mask byte equals zero take the shuffled value, all others are zero
; (the select's false operand is zeroinitializer).
; Both check prefixes expect a vmovdqa64 load into zmm1, vptestnmb into %k1,
; and a vpshufb from zmm1 writing zmm0 with {%k1} {z}.
5622define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
5623; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
5624; GENERIC:       # %bb.0:
5625; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
5626; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
5627; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
5628; GENERIC-NEXT:    retq # sched: [1:1.00]
5629;
5630; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
5631; SKX:       # %bb.0:
5632; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
5633; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
5634; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
5635; SKX-NEXT:    retq # sched: [7:1.00]
5636  %vec = load <64 x i8>, <64 x i8>* %vp
5637  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
5638  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
5639  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
5640  ret <64 x i8> %res
5641}
5642
; Unmasked shuffle of an <8 x i16> register argument that leaves elements 0-3
; in place and rearranges only elements 4-7.
; Both check prefixes expect this to lower to a single vpshufhw on xmm0.
5643define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
5644; GENERIC-LABEL: test_8xi16_perm_high_mask0:
5645; GENERIC:       # %bb.0:
5646; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
5647; GENERIC-NEXT:    retq # sched: [1:1.00]
5648;
5649; SKX-LABEL: test_8xi16_perm_high_mask0:
5650; SKX:       # %bb.0:
5651; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
5652; SKX-NEXT:    retq # sched: [7:1.00]
5653  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
5654  ret <8 x i16> %res
5655}
; Merge-masked shuffle of the upper four i16 elements (elements 0-3 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others keep the corresponding %vec2 element.
; Both check prefixes expect vptestnmw into %k1, a vpshufhw writing xmm1 under
; {%k1}, and a final vmovdqa of xmm1 into xmm0.
5656define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5657; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
5658; GENERIC:       # %bb.0:
5659; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5660; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
5661; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5662; GENERIC-NEXT:    retq # sched: [1:1.00]
5663;
5664; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
5665; SKX:       # %bb.0:
5666; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5667; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
5668; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5669; SKX-NEXT:    retq # sched: [7:1.00]
5670  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
5671  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5672  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5673  ret <8 x i16> %res
5674}
5675
; Zero-masked shuffle of the upper four i16 elements (elements 0-3 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others are zero (the select's false operand is zeroinitializer).
; Both check prefixes expect vptestnmw into %k1 and a vpshufhw on xmm0 with
; {%k1} {z}.
5676define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
5677; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
5678; GENERIC:       # %bb.0:
5679; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5680; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
5681; GENERIC-NEXT:    retq # sched: [1:1.00]
5682;
5683; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
5684; SKX:       # %bb.0:
5685; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5686; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
5687; SKX-NEXT:    retq # sched: [7:1.00]
5688  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
5689  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5690  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5691  ret <8 x i16> %res
5692}
; Merge-masked shuffle of the lower four i16 elements (elements 4-7 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others keep the corresponding %vec2 element.
; Both check prefixes expect vptestnmw into %k1, a vpshuflw writing xmm1 under
; {%k1}, and a final vmovdqa of xmm1 into xmm0.
5693define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5694; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
5695; GENERIC:       # %bb.0:
5696; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5697; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
5698; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5699; GENERIC-NEXT:    retq # sched: [1:1.00]
5700;
5701; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
5702; SKX:       # %bb.0:
5703; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5704; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
5705; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5706; SKX-NEXT:    retq # sched: [7:1.00]
5707  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
5708  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5709  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5710  ret <8 x i16> %res
5711}
5712
; Zero-masked shuffle of the lower four i16 elements (elements 4-7 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others are zero (the select's false operand is zeroinitializer).
; Both check prefixes expect vptestnmw into %k1 and a vpshuflw on xmm0 with
; {%k1} {z}.
5713define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
5714; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
5715; GENERIC:       # %bb.0:
5716; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5717; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
5718; GENERIC-NEXT:    retq # sched: [1:1.00]
5719;
5720; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
5721; SKX:       # %bb.0:
5722; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5723; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
5724; SKX-NEXT:    retq # sched: [7:1.00]
5725  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
5726  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5727  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5728  ret <8 x i16> %res
5729}
; Merge-masked shuffle of the upper four i16 elements (elements 0-3 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others keep the corresponding %vec2 element.
; Both check prefixes expect vptestnmw into %k1, a vpshufhw writing xmm1 under
; {%k1}, and a final vmovdqa of xmm1 into xmm0.
5730define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5731; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
5732; GENERIC:       # %bb.0:
5733; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5734; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
5735; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5736; GENERIC-NEXT:    retq # sched: [1:1.00]
5737;
5738; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
5739; SKX:       # %bb.0:
5740; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5741; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
5742; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5743; SKX-NEXT:    retq # sched: [7:1.00]
5744  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
5745  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5746  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5747  ret <8 x i16> %res
5748}
5749
; Zero-masked shuffle of the upper four i16 elements (elements 0-3 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others are zero (the select's false operand is zeroinitializer).
; Both check prefixes expect vptestnmw into %k1 and a vpshufhw on xmm0 with
; {%k1} {z}.
5750define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
5751; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
5752; GENERIC:       # %bb.0:
5753; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5754; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
5755; GENERIC-NEXT:    retq # sched: [1:1.00]
5756;
5757; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
5758; SKX:       # %bb.0:
5759; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5760; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
5761; SKX-NEXT:    retq # sched: [7:1.00]
5762  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
5763  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5764  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5765  ret <8 x i16> %res
5766}
; Unmasked shuffle of an <8 x i16> register argument that leaves elements 4-7
; in place and rearranges only elements 0-3.
; Both check prefixes expect this to lower to a single vpshuflw on xmm0.
5767define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
5768; GENERIC-LABEL: test_8xi16_perm_low_mask3:
5769; GENERIC:       # %bb.0:
5770; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
5771; GENERIC-NEXT:    retq # sched: [1:1.00]
5772;
5773; SKX-LABEL: test_8xi16_perm_low_mask3:
5774; SKX:       # %bb.0:
5775; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
5776; SKX-NEXT:    retq # sched: [7:1.00]
5777  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
5778  ret <8 x i16> %res
5779}
; Merge-masked shuffle of the lower four i16 elements (elements 4-7 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others keep the corresponding %vec2 element.
; Both check prefixes expect vptestnmw into %k1, a vpshuflw writing xmm1 under
; {%k1}, and a final vmovdqa of xmm1 into xmm0.
5780define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5781; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
5782; GENERIC:       # %bb.0:
5783; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5784; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
5785; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5786; GENERIC-NEXT:    retq # sched: [1:1.00]
5787;
5788; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
5789; SKX:       # %bb.0:
5790; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5791; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
5792; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5793; SKX-NEXT:    retq # sched: [7:1.00]
5794  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
5795  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5796  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5797  ret <8 x i16> %res
5798}
5799
; Zero-masked shuffle of the lower four i16 elements (elements 4-7 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others are zero (the select's false operand is zeroinitializer).
; Both check prefixes expect vptestnmw into %k1 and a vpshuflw on xmm0 with
; {%k1} {z}.
5800define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
5801; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
5802; GENERIC:       # %bb.0:
5803; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5804; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
5805; GENERIC-NEXT:    retq # sched: [1:1.00]
5806;
5807; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
5808; SKX:       # %bb.0:
5809; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5810; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
5811; SKX-NEXT:    retq # sched: [7:1.00]
5812  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
5813  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5814  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5815  ret <8 x i16> %res
5816}
; Merge-masked shuffle of the upper four i16 elements (elements 0-3 stay in
; place): result elements whose %mask word equals zero take the shuffled
; value, all others keep the corresponding %vec2 element.
; Both check prefixes expect vptestnmw into %k1, a vpshufhw writing xmm1 under
; {%k1}, and a final vmovdqa of xmm1 into xmm0.
5817define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5818; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
5819; GENERIC:       # %bb.0:
5820; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5821; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
5822; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5823; GENERIC-NEXT:    retq # sched: [1:1.00]
5824;
5825; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
5826; SKX:       # %bb.0:
5827; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5828; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
5829; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5830; SKX-NEXT:    retq # sched: [7:1.00]
5831  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
5832  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5833  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5834  ret <8 x i16> %res
5835}
5836
; Zero-masked high-half word shuffle, register source.
; Lanes 0-3 of %vec stay in place and lanes 4-7 become [5,5,7,6]; each result
; lane keeps the shuffled value where the matching %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a single zero-masked vpshufhw.
5837define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
5838; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
5839; GENERIC:       # %bb.0:
5840; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5841; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
5842; GENERIC-NEXT:    retq # sched: [1:1.00]
5843;
5844; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
5845; SKX:       # %bb.0:
5846; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5847; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
5848; SKX-NEXT:    retq # sched: [7:1.00]
5849  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
5850  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5851  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5852  ret <8 x i16> %res
5853}
; Merge-masked low-half word shuffle, register source.
; Lanes 0-3 of %vec become [3,3,2,1] and lanes 4-7 stay in place; each result
; lane takes the shuffled value where the matching %mask lane is 0, otherwise
; the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, a merge-masked vpshuflw writes xmm1,
; and the result is copied to xmm0.
5854define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5855; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
5856; GENERIC:       # %bb.0:
5857; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5858; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
5859; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5860; GENERIC-NEXT:    retq # sched: [1:1.00]
5861;
5862; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
5863; SKX:       # %bb.0:
5864; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5865; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
5866; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5867; SKX-NEXT:    retq # sched: [7:1.00]
5868  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
5869  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5870  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5871  ret <8 x i16> %res
5872}
5873
; Zero-masked low-half word shuffle, register source.
; Lanes 0-3 of %vec become [3,3,2,1] and lanes 4-7 stay in place; each result
; lane keeps the shuffled value where the matching %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a single zero-masked vpshuflw.
5874define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
5875; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
5876; GENERIC:       # %bb.0:
5877; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5878; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
5879; GENERIC-NEXT:    retq # sched: [1:1.00]
5880;
5881; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
5882; SKX:       # %bb.0:
5883; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5884; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
5885; SKX-NEXT:    retq # sched: [7:1.00]
5886  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
5887  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5888  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5889  ret <8 x i16> %res
5890}
; Unmasked high-half word shuffle, register source.
; Lanes 0-3 of %vec stay in place and lanes 4-7 become [6,5,6,5].
; Expected codegen: a single vpshufhw on xmm0.
5891define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
5892; GENERIC-LABEL: test_8xi16_perm_high_mask6:
5893; GENERIC:       # %bb.0:
5894; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
5895; GENERIC-NEXT:    retq # sched: [1:1.00]
5896;
5897; SKX-LABEL: test_8xi16_perm_high_mask6:
5898; SKX:       # %bb.0:
5899; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
5900; SKX-NEXT:    retq # sched: [7:1.00]
5901  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
5902  ret <8 x i16> %res
5903}
; Merge-masked high-half word shuffle, register source.
; Lanes 0-3 of %vec stay in place and lanes 4-7 become [6,5,6,5]; each result
; lane takes the shuffled value where the matching %mask lane is 0, otherwise
; the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, a merge-masked vpshufhw writes xmm1,
; and the result is copied to xmm0.
5904define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5905; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
5906; GENERIC:       # %bb.0:
5907; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5908; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
5909; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5910; GENERIC-NEXT:    retq # sched: [1:1.00]
5911;
5912; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
5913; SKX:       # %bb.0:
5914; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5915; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
5916; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5917; SKX-NEXT:    retq # sched: [7:1.00]
5918  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
5919  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5920  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5921  ret <8 x i16> %res
5922}
5923
; Zero-masked high-half word shuffle, register source.
; Lanes 0-3 of %vec stay in place and lanes 4-7 become [6,5,6,5]; each result
; lane keeps the shuffled value where the matching %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a single zero-masked vpshufhw.
5924define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
5925; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
5926; GENERIC:       # %bb.0:
5927; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5928; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
5929; GENERIC-NEXT:    retq # sched: [1:1.00]
5930;
5931; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
5932; SKX:       # %bb.0:
5933; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5934; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
5935; SKX-NEXT:    retq # sched: [7:1.00]
5936  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
5937  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5938  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5939  ret <8 x i16> %res
5940}
; Merge-masked low-half word shuffle, register source.
; Lanes 0-3 of %vec become [1,0,2,0] and lanes 4-7 stay in place; each result
; lane takes the shuffled value where the matching %mask lane is 0, otherwise
; the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, a merge-masked vpshuflw writes xmm1,
; and the result is copied to xmm0.
5941define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
5942; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
5943; GENERIC:       # %bb.0:
5944; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
5945; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
5946; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5947; GENERIC-NEXT:    retq # sched: [1:1.00]
5948;
5949; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
5950; SKX:       # %bb.0:
5951; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
5952; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
5953; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
5954; SKX-NEXT:    retq # sched: [7:1.00]
5955  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
5956  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5957  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
5958  ret <8 x i16> %res
5959}
5960
; Zero-masked low-half word shuffle, register source.
; Lanes 0-3 of %vec become [1,0,2,0] and lanes 4-7 stay in place; each result
; lane keeps the shuffled value where the matching %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a single zero-masked vpshuflw.
5961define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
5962; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
5963; GENERIC:       # %bb.0:
5964; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5965; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
5966; GENERIC-NEXT:    retq # sched: [1:1.00]
5967;
5968; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
5969; SKX:       # %bb.0:
5970; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
5971; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
5972; SKX-NEXT:    retq # sched: [7:1.00]
5973  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
5974  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
5975  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
5976  ret <8 x i16> %res
5977}
; Unmasked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,7,4,6].
; Expected codegen: a single vpshufhw with the load folded into a mem operand.
5978define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
5979; GENERIC-LABEL: test_8xi16_perm_high_mem_mask0:
5980; GENERIC:       # %bb.0:
5981; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
5982; GENERIC-NEXT:    retq # sched: [1:1.00]
5983;
5984; SKX-LABEL: test_8xi16_perm_high_mem_mask0:
5985; SKX:       # %bb.0:
5986; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
5987; SKX-NEXT:    retq # sched: [7:1.00]
5988  %vec = load <8 x i16>, <8 x i16>* %vp
5989  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
5990  ret <8 x i16> %res
5991}
; Merge-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,7,4,6]; each result lane takes the shuffled value where the matching
; %mask lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshufhw with
; the load folded into a mem operand, writing xmm0 directly.
5992define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
5993; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
5994; GENERIC:       # %bb.0:
5995; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
5996; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
5997; GENERIC-NEXT:    retq # sched: [1:1.00]
5998;
5999; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
6000; SKX:       # %bb.0:
6001; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6002; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
6003; SKX-NEXT:    retq # sched: [7:1.00]
6004  %vec = load <8 x i16>, <8 x i16>* %vp
6005  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
6006  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6007  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6008  ret <8 x i16> %res
6009}
6010
; Zero-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,7,4,6]; each result lane keeps the shuffled value where the matching
; %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshufhw with the
; load folded into a mem operand.
6011define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
6012; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
6013; GENERIC:       # %bb.0:
6014; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6015; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
6016; GENERIC-NEXT:    retq # sched: [1:1.00]
6017;
6018; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
6019; SKX:       # %bb.0:
6020; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6021; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
6022; SKX-NEXT:    retq # sched: [7:1.00]
6023  %vec = load <8 x i16>, <8 x i16>* %vp
6024  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
6025  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6026  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6027  ret <8 x i16> %res
6028}
6029
; Merge-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [1,3,3,2] and keeps lanes 4-7 in
; place; each result lane takes the shuffled value where the matching %mask
; lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshuflw with
; the load folded into a mem operand, writing xmm0 directly.
6030define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
6031; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
6032; GENERIC:       # %bb.0:
6033; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
6034; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
6035; GENERIC-NEXT:    retq # sched: [1:1.00]
6036;
6037; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
6038; SKX:       # %bb.0:
6039; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6040; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
6041; SKX-NEXT:    retq # sched: [7:1.00]
6042  %vec = load <8 x i16>, <8 x i16>* %vp
6043  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
6044  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6045  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6046  ret <8 x i16> %res
6047}
6048
; Zero-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [1,3,3,2] and keeps lanes 4-7 in
; place; each result lane keeps the shuffled value where the matching %mask
; lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshuflw with the
; load folded into a mem operand.
6049define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
6050; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
6051; GENERIC:       # %bb.0:
6052; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6053; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
6054; GENERIC-NEXT:    retq # sched: [1:1.00]
6055;
6056; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
6057; SKX:       # %bb.0:
6058; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6059; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
6060; SKX-NEXT:    retq # sched: [7:1.00]
6061  %vec = load <8 x i16>, <8 x i16>* %vp
6062  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
6063  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6064  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6065  ret <8 x i16> %res
6066}
6067
; Merge-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [6,6,5,7]; each result lane takes the shuffled value where the matching
; %mask lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshufhw with
; the load folded into a mem operand, writing xmm0 directly.
6068define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
6069; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
6070; GENERIC:       # %bb.0:
6071; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
6072; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
6073; GENERIC-NEXT:    retq # sched: [1:1.00]
6074;
6075; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
6076; SKX:       # %bb.0:
6077; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6078; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
6079; SKX-NEXT:    retq # sched: [7:1.00]
6080  %vec = load <8 x i16>, <8 x i16>* %vp
6081  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
6082  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6083  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6084  ret <8 x i16> %res
6085}
6086
; Zero-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [6,6,5,7]; each result lane keeps the shuffled value where the matching
; %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshufhw with the
; load folded into a mem operand.
6087define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
6088; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
6089; GENERIC:       # %bb.0:
6090; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6091; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
6092; GENERIC-NEXT:    retq # sched: [1:1.00]
6093;
6094; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
6095; SKX:       # %bb.0:
6096; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6097; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
6098; SKX-NEXT:    retq # sched: [7:1.00]
6099  %vec = load <8 x i16>, <8 x i16>* %vp
6100  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
6101  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6102  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6103  ret <8 x i16> %res
6104}
6105
; Unmasked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [3,1,2,0] and keeps lanes 4-7 in
; place.
; Expected codegen: a single vpshuflw with the load folded into a mem operand.
6106define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
6107; GENERIC-LABEL: test_8xi16_perm_low_mem_mask3:
6108; GENERIC:       # %bb.0:
6109; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
6110; GENERIC-NEXT:    retq # sched: [1:1.00]
6111;
6112; SKX-LABEL: test_8xi16_perm_low_mem_mask3:
6113; SKX:       # %bb.0:
6114; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
6115; SKX-NEXT:    retq # sched: [7:1.00]
6116  %vec = load <8 x i16>, <8 x i16>* %vp
6117  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
6118  ret <8 x i16> %res
6119}
; Merge-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [3,1,2,0] and keeps lanes 4-7 in
; place; each result lane takes the shuffled value where the matching %mask
; lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshuflw with
; the load folded into a mem operand, writing xmm0 directly.
6120define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
6121; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
6122; GENERIC:       # %bb.0:
6123; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
6124; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
6125; GENERIC-NEXT:    retq # sched: [1:1.00]
6126;
6127; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
6128; SKX:       # %bb.0:
6129; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6130; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
6131; SKX-NEXT:    retq # sched: [7:1.00]
6132  %vec = load <8 x i16>, <8 x i16>* %vp
6133  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
6134  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6135  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6136  ret <8 x i16> %res
6137}
6138
; Zero-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [3,1,2,0] and keeps lanes 4-7 in
; place; each result lane keeps the shuffled value where the matching %mask
; lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshuflw with the
; load folded into a mem operand.
6139define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
6140; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
6141; GENERIC:       # %bb.0:
6142; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6143; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
6144; GENERIC-NEXT:    retq # sched: [1:1.00]
6145;
6146; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
6147; SKX:       # %bb.0:
6148; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6149; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
6150; SKX-NEXT:    retq # sched: [7:1.00]
6151  %vec = load <8 x i16>, <8 x i16>* %vp
6152  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
6153  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6154  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6155  ret <8 x i16> %res
6156}
6157
; Merge-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,6,7,5]; each result lane takes the shuffled value where the matching
; %mask lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshufhw with
; the load folded into a mem operand, writing xmm0 directly.
6158define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
6159; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
6160; GENERIC:       # %bb.0:
6161; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
6162; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
6163; GENERIC-NEXT:    retq # sched: [1:1.00]
6164;
6165; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
6166; SKX:       # %bb.0:
6167; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6168; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
6169; SKX-NEXT:    retq # sched: [7:1.00]
6170  %vec = load <8 x i16>, <8 x i16>* %vp
6171  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
6172  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6173  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6174  ret <8 x i16> %res
6175}
6176
; Zero-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,6,7,5]; each result lane keeps the shuffled value where the matching
; %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshufhw with the
; load folded into a mem operand.
6177define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
6178; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
6179; GENERIC:       # %bb.0:
6180; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6181; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
6182; GENERIC-NEXT:    retq # sched: [1:1.00]
6183;
6184; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
6185; SKX:       # %bb.0:
6186; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6187; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
6188; SKX-NEXT:    retq # sched: [7:1.00]
6189  %vec = load <8 x i16>, <8 x i16>* %vp
6190  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
6191  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6192  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6193  ret <8 x i16> %res
6194}
6195
; Merge-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [2,1,3,2] and keeps lanes 4-7 in
; place; each result lane takes the shuffled value where the matching %mask
; lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshuflw with
; the load folded into a mem operand, writing xmm0 directly.
6196define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
6197; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
6198; GENERIC:       # %bb.0:
6199; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
6200; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
6201; GENERIC-NEXT:    retq # sched: [1:1.00]
6202;
6203; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
6204; SKX:       # %bb.0:
6205; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6206; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
6207; SKX-NEXT:    retq # sched: [7:1.00]
6208  %vec = load <8 x i16>, <8 x i16>* %vp
6209  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
6210  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6211  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6212  ret <8 x i16> %res
6213}
6214
; Zero-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [2,1,3,2] and keeps lanes 4-7 in
; place; each result lane keeps the shuffled value where the matching %mask
; lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshuflw with the
; load folded into a mem operand.
6215define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
6216; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
6217; GENERIC:       # %bb.0:
6218; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6219; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
6220; GENERIC-NEXT:    retq # sched: [1:1.00]
6221;
6222; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
6223; SKX:       # %bb.0:
6224; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6225; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
6226; SKX-NEXT:    retq # sched: [7:1.00]
6227  %vec = load <8 x i16>, <8 x i16>* %vp
6228  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
6229  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6230  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6231  ret <8 x i16> %res
6232}
6233
; Unmasked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,4,4,4].
; Expected codegen: a single vpshufhw with the load folded into a mem operand.
6234define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
6235; GENERIC-LABEL: test_8xi16_perm_high_mem_mask6:
6236; GENERIC:       # %bb.0:
6237; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
6238; GENERIC-NEXT:    retq # sched: [1:1.00]
6239;
6240; SKX-LABEL: test_8xi16_perm_high_mem_mask6:
6241; SKX:       # %bb.0:
6242; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
6243; SKX-NEXT:    retq # sched: [7:1.00]
6244  %vec = load <8 x i16>, <8 x i16>* %vp
6245  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
6246  ret <8 x i16> %res
6247}
; Merge-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,4,4,4]; each result lane takes the shuffled value where the matching
; %mask lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshufhw with
; the load folded into a mem operand, writing xmm0 directly.
6248define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
6249; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
6250; GENERIC:       # %bb.0:
6251; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
6252; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
6253; GENERIC-NEXT:    retq # sched: [1:1.00]
6254;
6255; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
6256; SKX:       # %bb.0:
6257; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6258; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
6259; SKX-NEXT:    retq # sched: [7:1.00]
6260  %vec = load <8 x i16>, <8 x i16>* %vp
6261  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
6262  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6263  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6264  ret <8 x i16> %res
6265}
6266
; Zero-masked high-half word shuffle, memory source.
; Loads <8 x i16> from %vp, keeps lanes 0-3 in place and sets lanes 4-7 to
; [7,4,4,4]; each result lane keeps the shuffled value where the matching
; %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshufhw with the
; load folded into a mem operand.
6267define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
6268; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
6269; GENERIC:       # %bb.0:
6270; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6271; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
6272; GENERIC-NEXT:    retq # sched: [1:1.00]
6273;
6274; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
6275; SKX:       # %bb.0:
6276; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6277; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
6278; SKX-NEXT:    retq # sched: [7:1.00]
6279  %vec = load <8 x i16>, <8 x i16>* %vp
6280  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
6281  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6282  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6283  ret <8 x i16> %res
6284}
6285
; Merge-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [0,3,3,1] and keeps lanes 4-7 in
; place; each result lane takes the shuffled value where the matching %mask
; lane is 0, otherwise the corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, then a merge-masked vpshuflw with
; the load folded into a mem operand, writing xmm0 directly.
6286define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
6287; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
6288; GENERIC:       # %bb.0:
6289; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
6290; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
6291; GENERIC-NEXT:    retq # sched: [1:1.00]
6292;
6293; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
6294; SKX:       # %bb.0:
6295; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
6296; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
6297; SKX-NEXT:    retq # sched: [7:1.00]
6298  %vec = load <8 x i16>, <8 x i16>* %vp
6299  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
6300  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6301  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
6302  ret <8 x i16> %res
6303}
6304
; Zero-masked low-half word shuffle, memory source.
; Loads <8 x i16> from %vp, sets lanes 0-3 to [0,3,3,1] and keeps lanes 4-7 in
; place; each result lane keeps the shuffled value where the matching %mask
; lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a zero-masked vpshuflw with the
; load folded into a mem operand.
6305define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
6306; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
6307; GENERIC:       # %bb.0:
6308; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
6309; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
6310; GENERIC-NEXT:    retq # sched: [1:1.00]
6311;
6312; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
6313; SKX:       # %bb.0:
6314; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
6315; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
6316; SKX-NEXT:    retq # sched: [7:1.00]
6317  %vec = load <8 x i16>, <8 x i16>* %vp
6318  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
6319  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
6320  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
6321  ret <8 x i16> %res
6322}
6323
; Unmasked 256-bit high-half word shuffle, register source.
; Within each 8-lane half of %vec, lanes 0-3 stay in place and lanes 4-7
; become [4,4,6,4] (indices relative to that half).
; Expected codegen: a single vpshufhw on ymm0.
6324define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
6325; GENERIC-LABEL: test_16xi16_perm_high_mask0:
6326; GENERIC:       # %bb.0:
6327; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
6328; GENERIC-NEXT:    retq # sched: [1:1.00]
6329;
6330; SKX-LABEL: test_16xi16_perm_high_mask0:
6331; SKX:       # %bb.0:
6332; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
6333; SKX-NEXT:    retq # sched: [7:1.00]
6334  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
6335  ret <16 x i16> %res
6336}
; Merge-masked 256-bit high-half word shuffle, register source.
; Within each 8-lane half of %vec, lanes 0-3 stay in place and lanes 4-7
; become [4,4,6,4] (indices relative to that half); each result lane takes the
; shuffled value where the matching %mask lane is 0, otherwise the
; corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, a merge-masked vpshufhw writes ymm1,
; and the result is copied to ymm0.
6337define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6338; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
6339; GENERIC:       # %bb.0:
6340; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6341; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
6342; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6343; GENERIC-NEXT:    retq # sched: [1:1.00]
6344;
6345; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
6346; SKX:       # %bb.0:
6347; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6348; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
6349; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6350; SKX-NEXT:    retq # sched: [7:1.00]
6351  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
6352  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6353  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6354  ret <16 x i16> %res
6355}
6356
; Zero-masked 256-bit high-half word shuffle, register source.
; Within each 8-lane half of %vec, lanes 0-3 stay in place and lanes 4-7
; become [4,4,6,4] (indices relative to that half); each result lane keeps the
; shuffled value where the matching %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a single zero-masked vpshufhw.
6357define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
6358; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
6359; GENERIC:       # %bb.0:
6360; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6361; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
6362; GENERIC-NEXT:    retq # sched: [1:1.00]
6363;
6364; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
6365; SKX:       # %bb.0:
6366; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6367; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
6368; SKX-NEXT:    retq # sched: [7:1.00]
6369  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
6370  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6371  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6372  ret <16 x i16> %res
6373}
; Merge-masked 256-bit low-half word shuffle, register source.
; Within each 8-lane half of %vec, lanes 0-3 become [0,2,3,2] (indices
; relative to that half) and lanes 4-7 stay in place; each result lane takes
; the shuffled value where the matching %mask lane is 0, otherwise the
; corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, a merge-masked vpshuflw writes ymm1,
; and the result is copied to ymm0.
6374define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6375; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
6376; GENERIC:       # %bb.0:
6377; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6378; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
6379; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6380; GENERIC-NEXT:    retq # sched: [1:1.00]
6381;
6382; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
6383; SKX:       # %bb.0:
6384; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6385; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
6386; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6387; SKX-NEXT:    retq # sched: [7:1.00]
6388  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6389  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6390  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6391  ret <16 x i16> %res
6392}
6393
; Zero-masked 256-bit low-half word shuffle, register source.
; Within each 8-lane half of %vec, lanes 0-3 become [0,2,3,2] (indices
; relative to that half) and lanes 4-7 stay in place; each result lane keeps
; the shuffled value where the matching %mask lane is 0, else 0.
; Expected codegen: vptestnmw builds %k1, then a single zero-masked vpshuflw.
6394define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
6395; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
6396; GENERIC:       # %bb.0:
6397; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6398; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
6399; GENERIC-NEXT:    retq # sched: [1:1.00]
6400;
6401; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
6402; SKX:       # %bb.0:
6403; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6404; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
6405; SKX-NEXT:    retq # sched: [7:1.00]
6406  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6407  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6408  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6409  ret <16 x i16> %res
6410}
; Merge-masked 256-bit high-half word shuffle, register source.
; Within each 8-lane half of %vec, lanes 0-3 stay in place and lanes 4-7
; become [7,5,5,5] (indices relative to that half); each result lane takes the
; shuffled value where the matching %mask lane is 0, otherwise the
; corresponding lane of %vec2.
; Expected codegen: vptestnmw builds %k1, a merge-masked vpshufhw writes ymm1,
; and the result is copied to ymm0.
6411define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6412; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
6413; GENERIC:       # %bb.0:
6414; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6415; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
6416; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6417; GENERIC-NEXT:    retq # sched: [1:1.00]
6418;
6419; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
6420; SKX:       # %bb.0:
6421; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6422; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
6423; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6424; SKX-NEXT:    retq # sched: [7:1.00]
6425  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
6426  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6427  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6428  ret <16 x i16> %res
6429}
6430
; Zero-masked form: elements 4-7 and 12-15 of %vec are permuted (0-3 and 8-11
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes become zero. The assertions below expect vptestnmw to build
; %k1 followed by a vpshufhw using {%k1} {z}.
6431define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
6432; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
6433; GENERIC:       # %bb.0:
6434; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6435; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
6436; GENERIC-NEXT:    retq # sched: [1:1.00]
6437;
6438; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
6439; SKX:       # %bb.0:
6440; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6441; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
6442; SKX-NEXT:    retq # sched: [7:1.00]
6443  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
6444  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6445  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6446  ret <16 x i16> %res
6447}
; Unmasked shuffle that permutes elements 0-3 and 8-11 of %vec and leaves
; elements 4-7 and 12-15 in place. The assertions below expect this to become
; a single vpshuflw on %ymm0.
6448define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
6449; GENERIC-LABEL: test_16xi16_perm_low_mask3:
6450; GENERIC:       # %bb.0:
6451; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
6452; GENERIC-NEXT:    retq # sched: [1:1.00]
6453;
6454; SKX-LABEL: test_16xi16_perm_low_mask3:
6455; SKX:       # %bb.0:
6456; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
6457; SKX-NEXT:    retq # sched: [7:1.00]
6458  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6459  ret <16 x i16> %res
6460}
; Merge-masked form: elements 0-3 and 8-11 of %vec are permuted (4-7 and 12-15
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes take %vec2. The assertions below expect vptestnmw to build
; %k1, a vpshuflw masked by %k1 writing %ymm1, and a vmovdqa copy into %ymm0.
6461define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6462; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
6463; GENERIC:       # %bb.0:
6464; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6465; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
6466; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6467; GENERIC-NEXT:    retq # sched: [1:1.00]
6468;
6469; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
6470; SKX:       # %bb.0:
6471; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6472; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
6473; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6474; SKX-NEXT:    retq # sched: [7:1.00]
6475  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6476  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6477  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6478  ret <16 x i16> %res
6479}
6480
; Zero-masked form: elements 0-3 and 8-11 of %vec are permuted (4-7 and 12-15
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes become zero. The assertions below expect vptestnmw to build
; %k1 followed by a vpshuflw using {%k1} {z}.
6481define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
6482; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
6483; GENERIC:       # %bb.0:
6484; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6485; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
6486; GENERIC-NEXT:    retq # sched: [1:1.00]
6487;
6488; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
6489; SKX:       # %bb.0:
6490; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6491; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
6492; SKX-NEXT:    retq # sched: [7:1.00]
6493  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6494  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6495  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6496  ret <16 x i16> %res
6497}
; Merge-masked form: elements 4-7 and 12-15 of %vec are permuted (0-3 and 8-11
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes take %vec2. The assertions below expect vptestnmw to build
; %k1, a vpshufhw masked by %k1 writing %ymm1, and a vmovdqa copy into %ymm0.
6498define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6499; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
6500; GENERIC:       # %bb.0:
6501; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6502; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
6503; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6504; GENERIC-NEXT:    retq # sched: [1:1.00]
6505;
6506; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
6507; SKX:       # %bb.0:
6508; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6509; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
6510; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6511; SKX-NEXT:    retq # sched: [7:1.00]
6512  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
6513  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6514  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6515  ret <16 x i16> %res
6516}
6517
; Zero-masked form: elements 4-7 and 12-15 of %vec are permuted (0-3 and 8-11
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes become zero. The assertions below expect vptestnmw to build
; %k1 followed by a vpshufhw using {%k1} {z}.
6518define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
6519; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
6520; GENERIC:       # %bb.0:
6521; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6522; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
6523; GENERIC-NEXT:    retq # sched: [1:1.00]
6524;
6525; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
6526; SKX:       # %bb.0:
6527; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6528; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
6529; SKX-NEXT:    retq # sched: [7:1.00]
6530  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
6531  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6532  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6533  ret <16 x i16> %res
6534}
; Merge-masked form: elements 0-3 and 8-11 of %vec are permuted (4-7 and 12-15
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes take %vec2. The assertions below expect vptestnmw to build
; %k1, a vpshuflw masked by %k1 writing %ymm1, and a vmovdqa copy into %ymm0.
6535define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6536; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
6537; GENERIC:       # %bb.0:
6538; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6539; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
6540; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6541; GENERIC-NEXT:    retq # sched: [1:1.00]
6542;
6543; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
6544; SKX:       # %bb.0:
6545; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6546; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
6547; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6548; SKX-NEXT:    retq # sched: [7:1.00]
6549  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
6550  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6551  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6552  ret <16 x i16> %res
6553}
6554
; Zero-masked form: elements 0-3 and 8-11 of %vec are permuted (4-7 and 12-15
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes become zero. The assertions below expect vptestnmw to build
; %k1 followed by a vpshuflw using {%k1} {z}.
6555define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
6556; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
6557; GENERIC:       # %bb.0:
6558; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6559; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
6560; GENERIC-NEXT:    retq # sched: [1:1.00]
6561;
6562; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
6563; SKX:       # %bb.0:
6564; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6565; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
6566; SKX-NEXT:    retq # sched: [7:1.00]
6567  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
6568  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6569  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6570  ret <16 x i16> %res
6571}
; Unmasked shuffle that permutes elements 4-7 and 12-15 of %vec and leaves
; elements 0-3 and 8-11 in place. The assertions below expect this to become
; a single vpshufhw on %ymm0.
6572define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
6573; GENERIC-LABEL: test_16xi16_perm_high_mask6:
6574; GENERIC:       # %bb.0:
6575; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
6576; GENERIC-NEXT:    retq # sched: [1:1.00]
6577;
6578; SKX-LABEL: test_16xi16_perm_high_mask6:
6579; SKX:       # %bb.0:
6580; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
6581; SKX-NEXT:    retq # sched: [7:1.00]
6582  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
6583  ret <16 x i16> %res
6584}
; Merge-masked form: elements 4-7 and 12-15 of %vec are permuted (0-3 and 8-11
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes take %vec2. The assertions below expect vptestnmw to build
; %k1, a vpshufhw masked by %k1 writing %ymm1, and a vmovdqa copy into %ymm0.
6585define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6586; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
6587; GENERIC:       # %bb.0:
6588; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6589; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
6590; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6591; GENERIC-NEXT:    retq # sched: [1:1.00]
6592;
6593; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
6594; SKX:       # %bb.0:
6595; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6596; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
6597; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6598; SKX-NEXT:    retq # sched: [7:1.00]
6599  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
6600  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6601  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6602  ret <16 x i16> %res
6603}
6604
; Zero-masked form: elements 4-7 and 12-15 of %vec are permuted (0-3 and 8-11
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes become zero. The assertions below expect vptestnmw to build
; %k1 followed by a vpshufhw using {%k1} {z}.
6605define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
6606; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
6607; GENERIC:       # %bb.0:
6608; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6609; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
6610; GENERIC-NEXT:    retq # sched: [1:1.00]
6611;
6612; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
6613; SKX:       # %bb.0:
6614; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6615; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
6616; SKX-NEXT:    retq # sched: [7:1.00]
6617  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
6618  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6619  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6620  ret <16 x i16> %res
6621}
; Merge-masked form: elements 0-3 and 8-11 of %vec are permuted (4-7 and 12-15
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes take %vec2. The assertions below expect vptestnmw to build
; %k1, a vpshuflw masked by %k1 writing %ymm1, and a vmovdqa copy into %ymm0.
6622define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
6623; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
6624; GENERIC:       # %bb.0:
6625; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
6626; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
6627; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
6628; GENERIC-NEXT:    retq # sched: [1:1.00]
6629;
6630; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
6631; SKX:       # %bb.0:
6632; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
6633; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
6634; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
6635; SKX-NEXT:    retq # sched: [7:1.00]
6636  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
6637  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6638  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6639  ret <16 x i16> %res
6640}
6641
; Zero-masked form: elements 0-3 and 8-11 of %vec are permuted (4-7 and 12-15
; stay in place); lanes whose %mask element is zero take the shuffled value,
; all other lanes become zero. The assertions below expect vptestnmw to build
; %k1 followed by a vpshuflw using {%k1} {z}.
6642define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
6643; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
6644; GENERIC:       # %bb.0:
6645; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6646; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
6647; GENERIC-NEXT:    retq # sched: [1:1.00]
6648;
6649; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
6650; SKX:       # %bb.0:
6651; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6652; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
6653; SKX-NEXT:    retq # sched: [7:1.00]
6654  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
6655  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6656  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6657  ret <16 x i16> %res
6658}
; Unmasked shuffle of a vector loaded from %vp: elements 4-7 and 12-15 are
; permuted, elements 0-3 and 8-11 stay in place. The assertions below expect
; the load to be folded into a single vpshufhw with a `mem` source.
6659define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
6660; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0:
6661; GENERIC:       # %bb.0:
6662; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
6663; GENERIC-NEXT:    retq # sched: [1:1.00]
6664;
6665; SKX-LABEL: test_16xi16_perm_high_mem_mask0:
6666; SKX:       # %bb.0:
6667; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
6668; SKX-NEXT:    retq # sched: [7:1.00]
6669  %vec = load <16 x i16>, <16 x i16>* %vp
6670  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
6671  ret <16 x i16> %res
6672}
; Merge-masked form with a memory source: the vector loaded from %vp has
; elements 4-7 and 12-15 permuted (0-3 and 8-11 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes take %vec2.
; The assertions below expect vptestnmw to build %k1 and a vpshufhw masked by
; %k1 with a `mem` source writing %ymm0.
6673define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6674; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
6675; GENERIC:       # %bb.0:
6676; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6677; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
6678; GENERIC-NEXT:    retq # sched: [1:1.00]
6679;
6680; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
6681; SKX:       # %bb.0:
6682; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6683; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
6684; SKX-NEXT:    retq # sched: [7:1.00]
6685  %vec = load <16 x i16>, <16 x i16>* %vp
6686  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
6687  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6688  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6689  ret <16 x i16> %res
6690}
6691
; Zero-masked form with a memory source: the vector loaded from %vp has
; elements 4-7 and 12-15 permuted (0-3 and 8-11 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes become zero.
; The assertions below expect vptestnmw to build %k1 followed by a vpshufhw
; using {%k1} {z} with a `mem` source.
6692define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
6693; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
6694; GENERIC:       # %bb.0:
6695; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6696; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
6697; GENERIC-NEXT:    retq # sched: [1:1.00]
6698;
6699; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
6700; SKX:       # %bb.0:
6701; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6702; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
6703; SKX-NEXT:    retq # sched: [7:1.00]
6704  %vec = load <16 x i16>, <16 x i16>* %vp
6705  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
6706  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6707  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6708  ret <16 x i16> %res
6709}
6710
; Merge-masked form with a memory source: the vector loaded from %vp has
; elements 0-3 and 8-11 permuted (4-7 and 12-15 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes take %vec2.
; The assertions below expect vptestnmw to build %k1 and a vpshuflw masked by
; %k1 with a `mem` source writing %ymm0.
6711define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6712; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
6713; GENERIC:       # %bb.0:
6714; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6715; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
6716; GENERIC-NEXT:    retq # sched: [1:1.00]
6717;
6718; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
6719; SKX:       # %bb.0:
6720; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6721; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
6722; SKX-NEXT:    retq # sched: [7:1.00]
6723  %vec = load <16 x i16>, <16 x i16>* %vp
6724  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
6725  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6726  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6727  ret <16 x i16> %res
6728}
6729
; Zero-masked form with a memory source: the vector loaded from %vp has
; elements 0-3 and 8-11 permuted (4-7 and 12-15 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes become zero.
; The assertions below expect vptestnmw to build %k1 followed by a vpshuflw
; using {%k1} {z} with a `mem` source.
6730define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
6731; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
6732; GENERIC:       # %bb.0:
6733; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6734; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
6735; GENERIC-NEXT:    retq # sched: [1:1.00]
6736;
6737; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
6738; SKX:       # %bb.0:
6739; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6740; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
6741; SKX-NEXT:    retq # sched: [7:1.00]
6742  %vec = load <16 x i16>, <16 x i16>* %vp
6743  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
6744  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6745  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6746  ret <16 x i16> %res
6747}
6748
; Merge-masked form with a memory source: the vector loaded from %vp has
; elements 4-7 and 12-15 permuted (0-3 and 8-11 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes take %vec2.
; The assertions below expect vptestnmw to build %k1 and a vpshufhw masked by
; %k1 with a `mem` source writing %ymm0.
6749define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6750; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
6751; GENERIC:       # %bb.0:
6752; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6753; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
6754; GENERIC-NEXT:    retq # sched: [1:1.00]
6755;
6756; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
6757; SKX:       # %bb.0:
6758; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6759; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
6760; SKX-NEXT:    retq # sched: [7:1.00]
6761  %vec = load <16 x i16>, <16 x i16>* %vp
6762  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
6763  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6764  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6765  ret <16 x i16> %res
6766}
6767
; Zero-masked form with a memory source: the vector loaded from %vp has
; elements 4-7 and 12-15 permuted (0-3 and 8-11 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes become zero.
; The assertions below expect vptestnmw to build %k1 followed by a vpshufhw
; using {%k1} {z} with a `mem` source.
6768define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
6769; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
6770; GENERIC:       # %bb.0:
6771; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6772; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
6773; GENERIC-NEXT:    retq # sched: [1:1.00]
6774;
6775; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
6776; SKX:       # %bb.0:
6777; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6778; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
6779; SKX-NEXT:    retq # sched: [7:1.00]
6780  %vec = load <16 x i16>, <16 x i16>* %vp
6781  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
6782  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6783  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6784  ret <16 x i16> %res
6785}
6786
; Unmasked shuffle of a vector loaded from %vp: elements 0-3 and 8-11 are
; permuted, elements 4-7 and 12-15 stay in place. The assertions below expect
; the load to be folded into a single vpshuflw with a `mem` source.
6787define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
6788; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3:
6789; GENERIC:       # %bb.0:
6790; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
6791; GENERIC-NEXT:    retq # sched: [1:1.00]
6792;
6793; SKX-LABEL: test_16xi16_perm_low_mem_mask3:
6794; SKX:       # %bb.0:
6795; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
6796; SKX-NEXT:    retq # sched: [7:1.00]
6797  %vec = load <16 x i16>, <16 x i16>* %vp
6798  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
6799  ret <16 x i16> %res
6800}
; Merge-masked form with a memory source: the vector loaded from %vp has
; elements 0-3 and 8-11 permuted (4-7 and 12-15 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes take %vec2.
; The assertions below expect vptestnmw to build %k1 and a vpshuflw masked by
; %k1 with a `mem` source writing %ymm0.
6801define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6802; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
6803; GENERIC:       # %bb.0:
6804; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6805; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
6806; GENERIC-NEXT:    retq # sched: [1:1.00]
6807;
6808; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
6809; SKX:       # %bb.0:
6810; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6811; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
6812; SKX-NEXT:    retq # sched: [7:1.00]
6813  %vec = load <16 x i16>, <16 x i16>* %vp
6814  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
6815  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6816  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6817  ret <16 x i16> %res
6818}
6819
; Zero-masked form with a memory source: the vector loaded from %vp has
; elements 0-3 and 8-11 permuted (4-7 and 12-15 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes become zero.
; The assertions below expect vptestnmw to build %k1 followed by a vpshuflw
; using {%k1} {z} with a `mem` source.
6820define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
6821; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
6822; GENERIC:       # %bb.0:
6823; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6824; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
6825; GENERIC-NEXT:    retq # sched: [1:1.00]
6826;
6827; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
6828; SKX:       # %bb.0:
6829; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6830; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
6831; SKX-NEXT:    retq # sched: [7:1.00]
6832  %vec = load <16 x i16>, <16 x i16>* %vp
6833  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
6834  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6835  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6836  ret <16 x i16> %res
6837}
6838
; Merge-masked form with a memory source: the vector loaded from %vp has
; elements 4-7 and 12-15 permuted (0-3 and 8-11 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes take %vec2.
; The assertions below expect vptestnmw to build %k1 and a vpshufhw masked by
; %k1 with a `mem` source writing %ymm0.
6839define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6840; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
6841; GENERIC:       # %bb.0:
6842; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6843; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
6844; GENERIC-NEXT:    retq # sched: [1:1.00]
6845;
6846; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
6847; SKX:       # %bb.0:
6848; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6849; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
6850; SKX-NEXT:    retq # sched: [7:1.00]
6851  %vec = load <16 x i16>, <16 x i16>* %vp
6852  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
6853  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6854  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6855  ret <16 x i16> %res
6856}
6857
; Zero-masked form with a memory source: the vector loaded from %vp has
; elements 4-7 and 12-15 permuted (0-3 and 8-11 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes become zero.
; The assertions below expect vptestnmw to build %k1 followed by a vpshufhw
; using {%k1} {z} with a `mem` source.
6858define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
6859; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
6860; GENERIC:       # %bb.0:
6861; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6862; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
6863; GENERIC-NEXT:    retq # sched: [1:1.00]
6864;
6865; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
6866; SKX:       # %bb.0:
6867; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6868; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
6869; SKX-NEXT:    retq # sched: [7:1.00]
6870  %vec = load <16 x i16>, <16 x i16>* %vp
6871  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
6872  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6873  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6874  ret <16 x i16> %res
6875}
6876
; Merge-masked form with a memory source: the vector loaded from %vp has
; elements 0-3 and 8-11 permuted (4-7 and 12-15 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes take %vec2.
; The assertions below expect vptestnmw to build %k1 and a vpshuflw masked by
; %k1 with a `mem` source writing %ymm0.
6877define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6878; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
6879; GENERIC:       # %bb.0:
6880; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6881; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
6882; GENERIC-NEXT:    retq # sched: [1:1.00]
6883;
6884; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
6885; SKX:       # %bb.0:
6886; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6887; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
6888; SKX-NEXT:    retq # sched: [7:1.00]
6889  %vec = load <16 x i16>, <16 x i16>* %vp
6890  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6891  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6892  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6893  ret <16 x i16> %res
6894}
6895
; Zero-masked form with a memory source: the vector loaded from %vp has
; elements 0-3 and 8-11 permuted (4-7 and 12-15 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes become zero.
; The assertions below expect vptestnmw to build %k1 followed by a vpshuflw
; using {%k1} {z} with a `mem` source.
6896define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
6897; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
6898; GENERIC:       # %bb.0:
6899; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6900; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
6901; GENERIC-NEXT:    retq # sched: [1:1.00]
6902;
6903; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
6904; SKX:       # %bb.0:
6905; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6906; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
6907; SKX-NEXT:    retq # sched: [7:1.00]
6908  %vec = load <16 x i16>, <16 x i16>* %vp
6909  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6910  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6911  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6912  ret <16 x i16> %res
6913}
6914
; Unmasked shuffle of a vector loaded from %vp: elements 4-7 and 12-15 are
; permuted, elements 0-3 and 8-11 stay in place. The assertions below expect
; the load to be folded into a single vpshufhw with a `mem` source.
6915define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
6916; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6:
6917; GENERIC:       # %bb.0:
6918; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
6919; GENERIC-NEXT:    retq # sched: [1:1.00]
6920;
6921; SKX-LABEL: test_16xi16_perm_high_mem_mask6:
6922; SKX:       # %bb.0:
6923; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
6924; SKX-NEXT:    retq # sched: [7:1.00]
6925  %vec = load <16 x i16>, <16 x i16>* %vp
6926  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
6927  ret <16 x i16> %res
6928}
; Merge-masked form with a memory source: the vector loaded from %vp has
; elements 4-7 and 12-15 permuted (0-3 and 8-11 stay in place); lanes whose
; %mask element is zero take the shuffled value, all other lanes take %vec2.
; The assertions below expect vptestnmw to build %k1 and a vpshufhw masked by
; %k1 with a `mem` source writing %ymm0.
6929define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6930; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
6931; GENERIC:       # %bb.0:
6932; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6933; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
6934; GENERIC-NEXT:    retq # sched: [1:1.00]
6935;
6936; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
6937; SKX:       # %bb.0:
6938; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6939; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
6940; SKX-NEXT:    retq # sched: [7:1.00]
6941  %vec = load <16 x i16>, <16 x i16>* %vp
6942  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
6943  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6944  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6945  ret <16 x i16> %res
6946}
6947
; Zero-masked form of the [4,4,4,5] high-word shuffle on a vector loaded from %vp:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshufhw
; with a memory source.
6948define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
6949; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
6950; GENERIC:       # %bb.0:
6951; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6952; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
6953; GENERIC-NEXT:    retq # sched: [1:1.00]
6954;
6955; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
6956; SKX:       # %bb.0:
6957; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6958; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
6959; SKX-NEXT:    retq # sched: [7:1.00]
6960  %vec = load <16 x i16>, <16 x i16>* %vp
6961  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
6962  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6963  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
6964  ret <16 x i16> %res
6965}
6966
; Merge-masked low-word shuffle of a <16 x i16> vector loaded from %vp: in each
; group of eight elements the low four become elements [3,1,3,2] of that group and
; the high four stay in place. Elements whose %mask entry equals zero take the
; shuffled value, all others keep %vec2. The checks expect vptestnmw to produce
; %k1 and a {%k1}-masked vpshuflw with a memory source.
6967define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
6968; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
6969; GENERIC:       # %bb.0:
6970; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
6971; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
6972; GENERIC-NEXT:    retq # sched: [1:1.00]
6973;
6974; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
6975; SKX:       # %bb.0:
6976; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
6977; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
6978; SKX-NEXT:    retq # sched: [7:1.00]
6979  %vec = load <16 x i16>, <16 x i16>* %vp
6980  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
6981  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
6982  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
6983  ret <16 x i16> %res
6984}
6985
; Zero-masked form of the [3,1,3,2] low-word shuffle on a vector loaded from %vp:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshuflw
; with a memory source.
6986define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
6987; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
6988; GENERIC:       # %bb.0:
6989; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
6990; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
6991; GENERIC-NEXT:    retq # sched: [1:1.00]
6992;
6993; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
6994; SKX:       # %bb.0:
6995; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
6996; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
6997; SKX-NEXT:    retq # sched: [7:1.00]
6998  %vec = load <16 x i16>, <16 x i16>* %vp
6999  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
7000  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
7001  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
7002  ret <16 x i16> %res
7003}
7004
; Unmasked shuffle of a <32 x i16> register argument: in each group of eight
; elements the low four stay in place and the high four become elements [4,5,6,4]
; of that group. The GENERIC and SKX checks both expect a single vpshufhw on zmm0.
7005define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
7006; GENERIC-LABEL: test_32xi16_perm_high_mask0:
7007; GENERIC:       # %bb.0:
7008; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
7009; GENERIC-NEXT:    retq # sched: [1:1.00]
7010;
7011; SKX-LABEL: test_32xi16_perm_high_mask0:
7012; SKX:       # %bb.0:
7013; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
7014; SKX-NEXT:    retq # sched: [7:1.00]
7015  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
7016  ret <32 x i16> %res
7017}
; Merge-masked form of the [4,5,6,4] high-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others keep
; %vec2. The checks expect vptestnmw to produce %k1, a {%k1}-masked vpshufhw
; written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7018define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7019; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
7020; GENERIC:       # %bb.0:
7021; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7022; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
7023; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7024; GENERIC-NEXT:    retq # sched: [1:1.00]
7025;
7026; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
7027; SKX:       # %bb.0:
7028; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7029; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
7030; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7031; SKX-NEXT:    retq # sched: [7:1.00]
7032  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
7033  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7034  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7035  ret <32 x i16> %res
7036}
7037
; Zero-masked form of the [4,5,6,4] high-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshufhw
; on zmm0.
7038define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
7039; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
7040; GENERIC:       # %bb.0:
7041; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7042; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
7043; GENERIC-NEXT:    retq # sched: [1:1.00]
7044;
7045; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
7046; SKX:       # %bb.0:
7047; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7048; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
7049; SKX-NEXT:    retq # sched: [7:1.00]
7050  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
7051  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7052  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7053  ret <32 x i16> %res
7054}
; Merge-masked low-word shuffle of a <32 x i16> register argument: in each group of
; eight elements the low four become elements [2,1,0,0] of that group and the high
; four stay in place. Elements whose %mask entry equals zero take the shuffled
; value, all others keep %vec2. The checks expect vptestnmw to produce %k1, a
; {%k1}-masked vpshuflw written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7055define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7056; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
7057; GENERIC:       # %bb.0:
7058; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7059; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
7060; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7061; GENERIC-NEXT:    retq # sched: [1:1.00]
7062;
7063; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
7064; SKX:       # %bb.0:
7065; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7066; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
7067; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7068; SKX-NEXT:    retq # sched: [7:1.00]
7069  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
7070  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7071  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7072  ret <32 x i16> %res
7073}
7074
; Zero-masked form of the [2,1,0,0] low-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshuflw
; on zmm0.
7075define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
7076; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
7077; GENERIC:       # %bb.0:
7078; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7079; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
7080; GENERIC-NEXT:    retq # sched: [1:1.00]
7081;
7082; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
7083; SKX:       # %bb.0:
7084; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7085; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
7086; SKX-NEXT:    retq # sched: [7:1.00]
7087  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
7088  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7089  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7090  ret <32 x i16> %res
7091}
; Merge-masked high-word shuffle of a <32 x i16> register argument: in each group
; of eight elements the low four stay in place and the high four become elements
; [4,6,4,7] of that group. Elements whose %mask entry equals zero take the shuffled
; value, all others keep %vec2. The checks expect vptestnmw to produce %k1, a
; {%k1}-masked vpshufhw written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7092define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7093; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
7094; GENERIC:       # %bb.0:
7095; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7096; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
7097; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7098; GENERIC-NEXT:    retq # sched: [1:1.00]
7099;
7100; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
7101; SKX:       # %bb.0:
7102; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7103; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
7104; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7105; SKX-NEXT:    retq # sched: [7:1.00]
7106  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
7107  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7108  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7109  ret <32 x i16> %res
7110}
7111
; Zero-masked form of the [4,6,4,7] high-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshufhw
; on zmm0.
7112define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
7113; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
7114; GENERIC:       # %bb.0:
7115; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7116; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
7117; GENERIC-NEXT:    retq # sched: [1:1.00]
7118;
7119; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
7120; SKX:       # %bb.0:
7121; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7122; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
7123; SKX-NEXT:    retq # sched: [7:1.00]
7124  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
7125  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7126  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7127  ret <32 x i16> %res
7128}
; Unmasked shuffle of a <32 x i16> register argument: in each group of eight
; elements the low four become elements [3,3,1,3] of that group and the high four
; stay in place. The GENERIC and SKX checks both expect a single vpshuflw on zmm0.
7129define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
7130; GENERIC-LABEL: test_32xi16_perm_low_mask3:
7131; GENERIC:       # %bb.0:
7132; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
7133; GENERIC-NEXT:    retq # sched: [1:1.00]
7134;
7135; SKX-LABEL: test_32xi16_perm_low_mask3:
7136; SKX:       # %bb.0:
7137; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
7138; SKX-NEXT:    retq # sched: [7:1.00]
7139  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
7140  ret <32 x i16> %res
7141}
; Merge-masked form of the [3,3,1,3] low-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others keep
; %vec2. The checks expect vptestnmw to produce %k1, a {%k1}-masked vpshuflw
; written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7142define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7143; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
7144; GENERIC:       # %bb.0:
7145; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7146; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
7147; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7148; GENERIC-NEXT:    retq # sched: [1:1.00]
7149;
7150; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
7151; SKX:       # %bb.0:
7152; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7153; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
7154; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7155; SKX-NEXT:    retq # sched: [7:1.00]
7156  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
7157  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7158  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7159  ret <32 x i16> %res
7160}
7161
; Zero-masked form of the [3,3,1,3] low-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshuflw
; on zmm0.
7162define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
7163; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
7164; GENERIC:       # %bb.0:
7165; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7166; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
7167; GENERIC-NEXT:    retq # sched: [1:1.00]
7168;
7169; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
7170; SKX:       # %bb.0:
7171; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7172; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
7173; SKX-NEXT:    retq # sched: [7:1.00]
7174  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
7175  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7176  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7177  ret <32 x i16> %res
7178}
; Merge-masked high-word shuffle of a <32 x i16> register argument: in each group
; of eight elements the low four stay in place and the high four become elements
; [7,7,5,6] of that group. Elements whose %mask entry equals zero take the shuffled
; value, all others keep %vec2. The checks expect vptestnmw to produce %k1, a
; {%k1}-masked vpshufhw written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7179define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7180; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
7181; GENERIC:       # %bb.0:
7182; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7183; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
7184; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7185; GENERIC-NEXT:    retq # sched: [1:1.00]
7186;
7187; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
7188; SKX:       # %bb.0:
7189; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7190; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
7191; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7192; SKX-NEXT:    retq # sched: [7:1.00]
7193  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
7194  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7195  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7196  ret <32 x i16> %res
7197}
7198
; Zero-masked form of the [7,7,5,6] high-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshufhw
; on zmm0.
7199define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
7200; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
7201; GENERIC:       # %bb.0:
7202; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7203; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
7204; GENERIC-NEXT:    retq # sched: [1:1.00]
7205;
7206; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
7207; SKX:       # %bb.0:
7208; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7209; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
7210; SKX-NEXT:    retq # sched: [7:1.00]
7211  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
7212  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7213  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7214  ret <32 x i16> %res
7215}
; Merge-masked low-word shuffle of a <32 x i16> register argument: in each group of
; eight elements the low four become elements [2,1,1,0] of that group and the high
; four stay in place. Elements whose %mask entry equals zero take the shuffled
; value, all others keep %vec2. The checks expect vptestnmw to produce %k1, a
; {%k1}-masked vpshuflw written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7216define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7217; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
7218; GENERIC:       # %bb.0:
7219; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7220; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
7221; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7222; GENERIC-NEXT:    retq # sched: [1:1.00]
7223;
7224; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
7225; SKX:       # %bb.0:
7226; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7227; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
7228; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7229; SKX-NEXT:    retq # sched: [7:1.00]
7230  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
7231  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7232  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7233  ret <32 x i16> %res
7234}
7235
; Zero-masked form of the [2,1,1,0] low-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshuflw
; on zmm0.
7236define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
7237; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
7238; GENERIC:       # %bb.0:
7239; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7240; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
7241; GENERIC-NEXT:    retq # sched: [1:1.00]
7242;
7243; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
7244; SKX:       # %bb.0:
7245; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7246; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
7247; SKX-NEXT:    retq # sched: [7:1.00]
7248  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
7249  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7250  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7251  ret <32 x i16> %res
7252}
; Unmasked shuffle of a <32 x i16> register argument: in each group of eight
; elements the low four stay in place and the high four become elements [4,4,5,6]
; of that group. The GENERIC and SKX checks both expect a single vpshufhw on zmm0.
7253define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
7254; GENERIC-LABEL: test_32xi16_perm_high_mask6:
7255; GENERIC:       # %bb.0:
7256; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
7257; GENERIC-NEXT:    retq # sched: [1:1.00]
7258;
7259; SKX-LABEL: test_32xi16_perm_high_mask6:
7260; SKX:       # %bb.0:
7261; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
7262; SKX-NEXT:    retq # sched: [7:1.00]
7263  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
7264  ret <32 x i16> %res
7265}
; Merge-masked form of the [4,4,5,6] high-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others keep
; %vec2. The checks expect vptestnmw to produce %k1, a {%k1}-masked vpshufhw
; written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7266define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7267; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
7268; GENERIC:       # %bb.0:
7269; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7270; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
7271; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7272; GENERIC-NEXT:    retq # sched: [1:1.00]
7273;
7274; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
7275; SKX:       # %bb.0:
7276; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7277; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
7278; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7279; SKX-NEXT:    retq # sched: [7:1.00]
7280  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
7281  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7282  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7283  ret <32 x i16> %res
7284}
7285
; Zero-masked form of the [4,4,5,6] high-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshufhw
; on zmm0.
7286define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
7287; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
7288; GENERIC:       # %bb.0:
7289; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7290; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
7291; GENERIC-NEXT:    retq # sched: [1:1.00]
7292;
7293; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
7294; SKX:       # %bb.0:
7295; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7296; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
7297; SKX-NEXT:    retq # sched: [7:1.00]
7298  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
7299  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7300  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7301  ret <32 x i16> %res
7302}
; Merge-masked low-word shuffle of a <32 x i16> register argument: in each group of
; eight elements the low four become elements [3,0,3,0] of that group and the high
; four stay in place. Elements whose %mask entry equals zero take the shuffled
; value, all others keep %vec2. The checks expect vptestnmw to produce %k1, a
; {%k1}-masked vpshuflw written into zmm1, and a vmovdqa64 copy of zmm1 into zmm0.
7303define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
7304; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
7305; GENERIC:       # %bb.0:
7306; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
7307; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
7308; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
7309; GENERIC-NEXT:    retq # sched: [1:1.00]
7310;
7311; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
7312; SKX:       # %bb.0:
7313; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
7314; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
7315; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
7316; SKX-NEXT:    retq # sched: [7:1.00]
7317  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
7318  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7319  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7320  ret <32 x i16> %res
7321}
7322
; Zero-masked form of the [3,0,3,0] low-word shuffle on a register argument:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshuflw
; on zmm0.
7323define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
7324; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
7325; GENERIC:       # %bb.0:
7326; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7327; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
7328; GENERIC-NEXT:    retq # sched: [1:1.00]
7329;
7330; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
7331; SKX:       # %bb.0:
7332; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7333; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
7334; SKX-NEXT:    retq # sched: [7:1.00]
7335  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
7336  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7337  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7338  ret <32 x i16> %res
7339}
; Unmasked shuffle of a <32 x i16> vector loaded from %vp: in each group of eight
; elements the low four stay in place and the high four become elements [7,4,5,6]
; of that group. The GENERIC and SKX checks both expect a single vpshufhw that
; reads its source from memory.
7340define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
7341; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0:
7342; GENERIC:       # %bb.0:
7343; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
7344; GENERIC-NEXT:    retq # sched: [1:1.00]
7345;
7346; SKX-LABEL: test_32xi16_perm_high_mem_mask0:
7347; SKX:       # %bb.0:
7348; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
7349; SKX-NEXT:    retq # sched: [7:1.00]
7350  %vec = load <32 x i16>, <32 x i16>* %vp
7351  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
7352  ret <32 x i16> %res
7353}
; Merge-masked form of the [7,4,5,6] high-word shuffle on a vector loaded from %vp:
; elements whose %mask entry equals zero take the shuffled value, all others keep
; the matching element of %vec2. The checks expect vptestnmw to produce %k1 and a
; {%k1}-masked vpshufhw with a memory source.
7354define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7355; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
7356; GENERIC:       # %bb.0:
7357; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7358; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
7359; GENERIC-NEXT:    retq # sched: [1:1.00]
7360;
7361; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
7362; SKX:       # %bb.0:
7363; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7364; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
7365; SKX-NEXT:    retq # sched: [7:1.00]
7366  %vec = load <32 x i16>, <32 x i16>* %vp
7367  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
7368  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7369  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7370  ret <32 x i16> %res
7371}
7372
; Zero-masked form of the [7,4,5,6] high-word shuffle on a vector loaded from %vp:
; elements whose %mask entry equals zero take the shuffled value, all others are
; set to zero. The checks expect vptestnmw to produce %k1 and a {%k1} {z} vpshufhw
; with a memory source.
7373define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
7374; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
7375; GENERIC:       # %bb.0:
7376; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7377; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
7378; GENERIC-NEXT:    retq # sched: [1:1.00]
7379;
7380; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
7381; SKX:       # %bb.0:
7382; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7383; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
7384; SKX-NEXT:    retq # sched: [7:1.00]
7385  %vec = load <32 x i16>, <32 x i16>* %vp
7386  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
7387  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7388  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7389  ret <32 x i16> %res
7390}
7391
; Merge-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 0-3 of every 8-element group as [1,1,3,3], and selects
; the shuffled lane where %mask is zero, otherwise the matching %vec2 lane.
; Both check prefixes expect vptestnmw to form %k1 followed by one merge-masked
; vpshuflw that reads its source directly from memory.
7392define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7393; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
7394; GENERIC:       # %bb.0:
7395; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7396; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
7397; GENERIC-NEXT:    retq # sched: [1:1.00]
7398;
7399; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
7400; SKX:       # %bb.0:
7401; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7402; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
7403; SKX-NEXT:    retq # sched: [7:1.00]
7404  %vec = load <32 x i16>, <32 x i16>* %vp
7405  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
7406  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7407  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7408  ret <32 x i16> %res
7409}
7410
; Zero-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 0-3 of every 8-element group as [1,1,3,3], and keeps
; the shuffled lane only where %mask is zero (all other lanes become zero).
; Both check prefixes expect vptestnmw to form %k1 followed by one zero-masked
; vpshuflw that reads its source directly from memory.
7411define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
7412; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
7413; GENERIC:       # %bb.0:
7414; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7415; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
7416; GENERIC-NEXT:    retq # sched: [1:1.00]
7417;
7418; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
7419; SKX:       # %bb.0:
7420; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7421; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
7422; SKX-NEXT:    retq # sched: [7:1.00]
7423  %vec = load <32 x i16>, <32 x i16>* %vp
7424  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
7425  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7426  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7427  ret <32 x i16> %res
7428}
7429
; Merge-masked high-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 4-7 of every 8-element group as [4,7,6,4], and selects
; the shuffled lane where %mask is zero, otherwise the matching %vec2 lane.
; Both check prefixes expect vptestnmw to form %k1 followed by one merge-masked
; vpshufhw that reads its source directly from memory.
7430define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7431; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
7432; GENERIC:       # %bb.0:
7433; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7434; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
7435; GENERIC-NEXT:    retq # sched: [1:1.00]
7436;
7437; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
7438; SKX:       # %bb.0:
7439; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7440; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
7441; SKX-NEXT:    retq # sched: [7:1.00]
7442  %vec = load <32 x i16>, <32 x i16>* %vp
7443  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
7444  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7445  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7446  ret <32 x i16> %res
7447}
7448
; Zero-masked high-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 4-7 of every 8-element group as [4,7,6,4], and keeps
; the shuffled lane only where %mask is zero (all other lanes become zero).
; Both check prefixes expect vptestnmw to form %k1 followed by one zero-masked
; vpshufhw that reads its source directly from memory.
7449define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
7450; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
7451; GENERIC:       # %bb.0:
7452; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7453; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
7454; GENERIC-NEXT:    retq # sched: [1:1.00]
7455;
7456; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
7457; SKX:       # %bb.0:
7458; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7459; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
7460; SKX-NEXT:    retq # sched: [7:1.00]
7461  %vec = load <32 x i16>, <32 x i16>* %vp
7462  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
7463  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7464  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7465  ret <32 x i16> %res
7466}
7467
; Unmasked low-word permute with a memory source: loads <32 x i16> from %vp and
; reorders elements 0-3 of every 8-element group as [2,2,0,3]; elements 4-7 of
; each group stay in place.
; Both check prefixes expect a single vpshuflw that reads its source from memory.
7468define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
7469; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3:
7470; GENERIC:       # %bb.0:
7471; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
7472; GENERIC-NEXT:    retq # sched: [1:1.00]
7473;
7474; SKX-LABEL: test_32xi16_perm_low_mem_mask3:
7475; SKX:       # %bb.0:
7476; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
7477; SKX-NEXT:    retq # sched: [7:1.00]
7478  %vec = load <32 x i16>, <32 x i16>* %vp
7479  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
7480  ret <32 x i16> %res
7481}
; Merge-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 0-3 of every 8-element group as [2,2,0,3], and selects
; the shuffled lane where %mask is zero, otherwise the matching %vec2 lane.
; Both check prefixes expect vptestnmw to form %k1 followed by one merge-masked
; vpshuflw that reads its source directly from memory.
7482define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7483; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
7484; GENERIC:       # %bb.0:
7485; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7486; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
7487; GENERIC-NEXT:    retq # sched: [1:1.00]
7488;
7489; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
7490; SKX:       # %bb.0:
7491; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7492; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
7493; SKX-NEXT:    retq # sched: [7:1.00]
7494  %vec = load <32 x i16>, <32 x i16>* %vp
7495  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
7496  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7497  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7498  ret <32 x i16> %res
7499}
7500
; Zero-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 0-3 of every 8-element group as [2,2,0,3], and keeps
; the shuffled lane only where %mask is zero (all other lanes become zero).
; Both check prefixes expect vptestnmw to form %k1 followed by one zero-masked
; vpshuflw that reads its source directly from memory.
7501define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
7502; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
7503; GENERIC:       # %bb.0:
7504; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7505; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
7506; GENERIC-NEXT:    retq # sched: [1:1.00]
7507;
7508; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
7509; SKX:       # %bb.0:
7510; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7511; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
7512; SKX-NEXT:    retq # sched: [7:1.00]
7513  %vec = load <32 x i16>, <32 x i16>* %vp
7514  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
7515  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7516  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7517  ret <32 x i16> %res
7518}
7519
; Merge-masked high-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 4-7 of every 8-element group as [7,4,6,5], and selects
; the shuffled lane where %mask is zero, otherwise the matching %vec2 lane.
; Both check prefixes expect vptestnmw to form %k1 followed by one merge-masked
; vpshufhw that reads its source directly from memory.
7520define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7521; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
7522; GENERIC:       # %bb.0:
7523; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7524; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
7525; GENERIC-NEXT:    retq # sched: [1:1.00]
7526;
7527; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
7528; SKX:       # %bb.0:
7529; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7530; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
7531; SKX-NEXT:    retq # sched: [7:1.00]
7532  %vec = load <32 x i16>, <32 x i16>* %vp
7533  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
7534  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7535  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7536  ret <32 x i16> %res
7537}
7538
; Zero-masked high-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 4-7 of every 8-element group as [7,4,6,5], and keeps
; the shuffled lane only where %mask is zero (all other lanes become zero).
; Both check prefixes expect vptestnmw to form %k1 followed by one zero-masked
; vpshufhw that reads its source directly from memory.
7539define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
7540; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
7541; GENERIC:       # %bb.0:
7542; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7543; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
7544; GENERIC-NEXT:    retq # sched: [1:1.00]
7545;
7546; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
7547; SKX:       # %bb.0:
7548; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7549; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
7550; SKX-NEXT:    retq # sched: [7:1.00]
7551  %vec = load <32 x i16>, <32 x i16>* %vp
7552  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
7553  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7554  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7555  ret <32 x i16> %res
7556}
7557
; Merge-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, rewrites elements 0-3 of every 8-element group as [0,1,0,1], and selects
; the shuffled lane where %mask is zero, otherwise the matching %vec2 lane.
; Unlike the neighbouring low-word tests, the expected output here is not a
; masked vpshuflw: both check prefixes show an unmasked dword shuffle (vpshufd
; from memory into zmm2), then vptestnmw forming %k1, then a merge-masked
; vmovdqu16 into zmm0.
; NOTE(review): presumably this is because the word pattern is also a valid
; dword pattern while the mask is per-word - confirm against the X86 lowering.
7558define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7559; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
7560; GENERIC:       # %bb.0:
7561; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
7562; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7563; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.50]
7564; GENERIC-NEXT:    retq # sched: [1:1.00]
7565;
7566; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
7567; SKX:       # %bb.0:
7568; SKX-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
7569; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7570; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
7571; SKX-NEXT:    retq # sched: [7:1.00]
7572  %vec = load <32 x i16>, <32 x i16>* %vp
7573  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
7574  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7575  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7576  ret <32 x i16> %res
7577}
7578
; Zero-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, rewrites elements 0-3 of every 8-element group as [0,1,0,1], and keeps
; the shuffled lane only where %mask is zero (all other lanes become zero).
; Unlike the neighbouring low-word tests, the expected output here is not a
; masked vpshuflw: both check prefixes show an unmasked dword shuffle (vpshufd
; from memory into zmm1), then vptestnmw forming %k1, then a zero-masked
; vmovdqu16 into zmm0.
; NOTE(review): presumably this is because the word pattern is also a valid
; dword pattern while the mask is per-word - confirm against the X86 lowering.
7579define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
7580; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
7581; GENERIC:       # %bb.0:
7582; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
7583; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7584; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
7585; GENERIC-NEXT:    retq # sched: [1:1.00]
7586;
7587; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
7588; SKX:       # %bb.0:
7589; SKX-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
7590; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7591; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
7592; SKX-NEXT:    retq # sched: [7:1.00]
7593  %vec = load <32 x i16>, <32 x i16>* %vp
7594  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
7595  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7596  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7597  ret <32 x i16> %res
7598}
7599
; Unmasked high-word permute with a memory source: loads <32 x i16> from %vp and
; reorders elements 4-7 of every 8-element group as [6,5,6,6]; elements 0-3 of
; each group stay in place.
; Both check prefixes expect a single vpshufhw that reads its source from memory.
7600define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
7601; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6:
7602; GENERIC:       # %bb.0:
7603; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
7604; GENERIC-NEXT:    retq # sched: [1:1.00]
7605;
7606; SKX-LABEL: test_32xi16_perm_high_mem_mask6:
7607; SKX:       # %bb.0:
7608; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
7609; SKX-NEXT:    retq # sched: [7:1.00]
7610  %vec = load <32 x i16>, <32 x i16>* %vp
7611  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
7612  ret <32 x i16> %res
7613}
; Merge-masked high-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 4-7 of every 8-element group as [6,5,6,6], and selects
; the shuffled lane where %mask is zero, otherwise the matching %vec2 lane.
; Both check prefixes expect vptestnmw to form %k1 followed by one merge-masked
; vpshufhw that reads its source directly from memory.
7614define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7615; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
7616; GENERIC:       # %bb.0:
7617; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7618; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
7619; GENERIC-NEXT:    retq # sched: [1:1.00]
7620;
7621; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
7622; SKX:       # %bb.0:
7623; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7624; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
7625; SKX-NEXT:    retq # sched: [7:1.00]
7626  %vec = load <32 x i16>, <32 x i16>* %vp
7627  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
7628  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7629  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7630  ret <32 x i16> %res
7631}
7632
; Zero-masked high-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 4-7 of every 8-element group as [6,5,6,6], and keeps
; the shuffled lane only where %mask is zero (all other lanes become zero).
; Both check prefixes expect vptestnmw to form %k1 followed by one zero-masked
; vpshufhw that reads its source directly from memory.
7633define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
7634; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
7635; GENERIC:       # %bb.0:
7636; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7637; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
7638; GENERIC-NEXT:    retq # sched: [1:1.00]
7639;
7640; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
7641; SKX:       # %bb.0:
7642; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7643; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
7644; SKX-NEXT:    retq # sched: [7:1.00]
7645  %vec = load <32 x i16>, <32 x i16>* %vp
7646  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
7647  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7648  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7649  ret <32 x i16> %res
7650}
7651
; Merge-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 0-3 of every 8-element group as [3,1,3,0], and selects
; the shuffled lane where %mask is zero, otherwise the matching %vec2 lane.
; Both check prefixes expect vptestnmw to form %k1 followed by one merge-masked
; vpshuflw that reads its source directly from memory.
7652define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
7653; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
7654; GENERIC:       # %bb.0:
7655; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
7656; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
7657; GENERIC-NEXT:    retq # sched: [1:1.00]
7658;
7659; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
7660; SKX:       # %bb.0:
7661; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
7662; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
7663; SKX-NEXT:    retq # sched: [7:1.00]
7664  %vec = load <32 x i16>, <32 x i16>* %vp
7665  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
7666  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7667  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
7668  ret <32 x i16> %res
7669}
7670
; Zero-masked low-word permute with a memory source: loads <32 x i16> from
; %vp, reorders elements 0-3 of every 8-element group as [3,1,3,0], and keeps
; the shuffled lane only where %mask is zero (all other lanes become zero).
; Both check prefixes expect vptestnmw to form %k1 followed by one zero-masked
; vpshuflw that reads its source directly from memory.
7671define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
7672; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
7673; GENERIC:       # %bb.0:
7674; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
7675; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
7676; GENERIC-NEXT:    retq # sched: [1:1.00]
7677;
7678; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
7679; SKX:       # %bb.0:
7680; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
7681; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
7682; SKX-NEXT:    retq # sched: [7:1.00]
7683  %vec = load <32 x i16>, <32 x i16>* %vp
7684  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
7685  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
7686  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
7687  ret <32 x i16> %res
7688}
7689
; Unmasked register permute of a <4 x i32> vector with indices [2,3,3,0].
; Both check prefixes expect a single vpermilps on xmm0 (not vpshufd, which the
; masked variants of this permute use).
7690define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
7691; GENERIC-LABEL: test_4xi32_perm_mask0:
7692; GENERIC:       # %bb.0:
7693; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
7694; GENERIC-NEXT:    retq # sched: [1:1.00]
7695;
7696; SKX-LABEL: test_4xi32_perm_mask0:
7697; SKX:       # %bb.0:
7698; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
7699; SKX-NEXT:    retq # sched: [7:1.00]
7700  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
7701  ret <4 x i32> %res
7702}
; Merge-masked register permute of <4 x i32> with indices [2,3,3,0]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise the %vec2 lane.
; Both check prefixes expect vptestnmd to form %k1, a merge-masked vpshufd
; writing xmm1, and a vmovdqa that copies xmm1 into xmm0 before returning.
7703define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
7704; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
7705; GENERIC:       # %bb.0:
7706; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
7707; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:0.50]
7708; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7709; GENERIC-NEXT:    retq # sched: [1:1.00]
7710;
7711; SKX-LABEL: test_masked_4xi32_perm_mask0:
7712; SKX:       # %bb.0:
7713; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
7714; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
7715; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7716; SKX-NEXT:    retq # sched: [7:1.00]
7717  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
7718  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7719  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
7720  ret <4 x i32> %res
7721}
7722
; Zero-masked register permute of <4 x i32> with indices [2,3,3,0]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise zero.
; Both check prefixes expect vptestnmd to form %k1 followed by one zero-masked
; vpshufd that writes xmm0 in place.
7723define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
7724; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
7725; GENERIC:       # %bb.0:
7726; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
7727; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:0.50]
7728; GENERIC-NEXT:    retq # sched: [1:1.00]
7729;
7730; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
7731; SKX:       # %bb.0:
7732; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
7733; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
7734; SKX-NEXT:    retq # sched: [7:1.00]
7735  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
7736  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7737  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
7738  ret <4 x i32> %res
7739}
; Merge-masked register permute of <4 x i32> with indices [1,0,2,0]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise the %vec2 lane.
; Both check prefixes expect vptestnmd to form %k1, a merge-masked vpshufd
; writing xmm1, and a vmovdqa that copies xmm1 into xmm0 before returning.
7740define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
7741; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
7742; GENERIC:       # %bb.0:
7743; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
7744; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:0.50]
7745; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7746; GENERIC-NEXT:    retq # sched: [1:1.00]
7747;
7748; SKX-LABEL: test_masked_4xi32_perm_mask1:
7749; SKX:       # %bb.0:
7750; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
7751; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
7752; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7753; SKX-NEXT:    retq # sched: [7:1.00]
7754  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
7755  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7756  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
7757  ret <4 x i32> %res
7758}
7759
; Zero-masked register permute of <4 x i32> with indices [1,0,2,0]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise zero.
; Both check prefixes expect vptestnmd to form %k1 followed by one zero-masked
; vpshufd that writes xmm0 in place.
7760define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
7761; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
7762; GENERIC:       # %bb.0:
7763; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
7764; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:0.50]
7765; GENERIC-NEXT:    retq # sched: [1:1.00]
7766;
7767; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
7768; SKX:       # %bb.0:
7769; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
7770; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
7771; SKX-NEXT:    retq # sched: [7:1.00]
7772  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
7773  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7774  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
7775  ret <4 x i32> %res
7776}
; Merge-masked register permute of <4 x i32> with indices [3,0,1,0]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise the %vec2 lane.
; Both check prefixes expect vptestnmd to form %k1, a merge-masked vpshufd
; writing xmm1, and a vmovdqa that copies xmm1 into xmm0 before returning.
7777define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
7778; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
7779; GENERIC:       # %bb.0:
7780; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
7781; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:0.50]
7782; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7783; GENERIC-NEXT:    retq # sched: [1:1.00]
7784;
7785; SKX-LABEL: test_masked_4xi32_perm_mask2:
7786; SKX:       # %bb.0:
7787; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
7788; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
7789; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7790; SKX-NEXT:    retq # sched: [7:1.00]
7791  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
7792  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7793  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
7794  ret <4 x i32> %res
7795}
7796
; Zero-masked register permute of <4 x i32> with indices [3,0,1,0]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise zero.
; Both check prefixes expect vptestnmd to form %k1 followed by one zero-masked
; vpshufd that writes xmm0 in place.
7797define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
7798; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
7799; GENERIC:       # %bb.0:
7800; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
7801; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:0.50]
7802; GENERIC-NEXT:    retq # sched: [1:1.00]
7803;
7804; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
7805; SKX:       # %bb.0:
7806; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
7807; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
7808; SKX-NEXT:    retq # sched: [7:1.00]
7809  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
7810  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7811  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
7812  ret <4 x i32> %res
7813}
; Unmasked register permute of a <4 x i32> vector with indices [1,1,0,3].
; Both check prefixes expect a single vpermilps on xmm0 (not vpshufd, which the
; masked variants of this permute use).
7814define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
7815; GENERIC-LABEL: test_4xi32_perm_mask3:
7816; GENERIC:       # %bb.0:
7817; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
7818; GENERIC-NEXT:    retq # sched: [1:1.00]
7819;
7820; SKX-LABEL: test_4xi32_perm_mask3:
7821; SKX:       # %bb.0:
7822; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
7823; SKX-NEXT:    retq # sched: [7:1.00]
7824  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
7825  ret <4 x i32> %res
7826}
; Merge-masked register permute of <4 x i32> with indices [1,1,0,3]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise the %vec2 lane.
; Both check prefixes expect vptestnmd to form %k1, a merge-masked vpshufd
; writing xmm1, and a vmovdqa that copies xmm1 into xmm0 before returning.
7827define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
7828; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
7829; GENERIC:       # %bb.0:
7830; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
7831; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:0.50]
7832; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7833; GENERIC-NEXT:    retq # sched: [1:1.00]
7834;
7835; SKX-LABEL: test_masked_4xi32_perm_mask3:
7836; SKX:       # %bb.0:
7837; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
7838; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
7839; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
7840; SKX-NEXT:    retq # sched: [7:1.00]
7841  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
7842  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7843  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
7844  ret <4 x i32> %res
7845}
7846
; Zero-masked register permute of <4 x i32> with indices [1,1,0,3]: the result
; lane is the shuffled %vec lane where %mask is zero, otherwise zero.
; Both check prefixes expect vptestnmd to form %k1 followed by one zero-masked
; vpshufd that writes xmm0 in place.
7847define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
7848; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
7849; GENERIC:       # %bb.0:
7850; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
7851; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:0.50]
7852; GENERIC-NEXT:    retq # sched: [1:1.00]
7853;
7854; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
7855; SKX:       # %bb.0:
7856; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
7857; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
7858; SKX-NEXT:    retq # sched: [7:1.00]
7859  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
7860  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7861  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
7862  ret <4 x i32> %res
7863}
; Unmasked permute of a <4 x i32> vector loaded from %vp, indices [0,1,3,3].
; Both check prefixes expect a single vpermilps that reads its source from
; memory (the load is folded into the shuffle).
7864define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
7865; GENERIC-LABEL: test_4xi32_perm_mem_mask0:
7866; GENERIC:       # %bb.0:
7867; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
7868; GENERIC-NEXT:    retq # sched: [1:1.00]
7869;
7870; SKX-LABEL: test_4xi32_perm_mem_mask0:
7871; SKX:       # %bb.0:
7872; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
7873; SKX-NEXT:    retq # sched: [7:1.00]
7874  %vec = load <4 x i32>, <4 x i32>* %vp
7875  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
7876  ret <4 x i32> %res
7877}
; Merge-masked permute of a <4 x i32> vector loaded from %vp, indices [0,1,3,3]:
; the result lane is the shuffled lane where %mask is zero, otherwise the
; matching %vec2 lane.
; Both check prefixes expect vptestnmd to form %k1 followed by one merge-masked
; vpshufd that reads its source directly from memory and writes xmm0.
7878define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
7879; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
7880; GENERIC:       # %bb.0:
7881; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
7882; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:0.50]
7883; GENERIC-NEXT:    retq # sched: [1:1.00]
7884;
7885; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
7886; SKX:       # %bb.0:
7887; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
7888; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
7889; SKX-NEXT:    retq # sched: [7:1.00]
7890  %vec = load <4 x i32>, <4 x i32>* %vp
7891  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
7892  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7893  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
7894  ret <4 x i32> %res
7895}
7896
; Zero-masked permute <0,1,3,3> of a loaded <4 x i32>: lanes whose %mask
; element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
7897define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
7898; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
7899; GENERIC:       # %bb.0:
7900; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
7901; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:0.50]
7902; GENERIC-NEXT:    retq # sched: [1:1.00]
7903;
7904; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
7905; SKX:       # %bb.0:
7906; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
7907; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
7908; SKX-NEXT:    retq # sched: [7:1.00]
7909  %vec = load <4 x i32>, <4 x i32>* %vp
7910  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
7911  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7912  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
7913  ret <4 x i32> %res
7914}
7915
; Merge-masked permute <2,2,3,1> of a loaded <4 x i32>: lanes whose %mask
; element is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd to build %k1, then a {%k1}-masked vpshufd from memory.
7916define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
7917; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
7918; GENERIC:       # %bb.0:
7919; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
7920; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:0.50]
7921; GENERIC-NEXT:    retq # sched: [1:1.00]
7922;
7923; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
7924; SKX:       # %bb.0:
7925; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
7926; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
7927; SKX-NEXT:    retq # sched: [7:1.00]
7928  %vec = load <4 x i32>, <4 x i32>* %vp
7929  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
7930  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7931  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
7932  ret <4 x i32> %res
7933}
7934
; Zero-masked permute <2,2,3,1> of a loaded <4 x i32>: lanes whose %mask
; element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
7935define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
7936; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
7937; GENERIC:       # %bb.0:
7938; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
7939; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:0.50]
7940; GENERIC-NEXT:    retq # sched: [1:1.00]
7941;
7942; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
7943; SKX:       # %bb.0:
7944; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
7945; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
7946; SKX-NEXT:    retq # sched: [7:1.00]
7947  %vec = load <4 x i32>, <4 x i32>* %vp
7948  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
7949  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7950  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
7951  ret <4 x i32> %res
7952}
7953
; Merge-masked permute <0,3,0,1> of a loaded <4 x i32>: lanes whose %mask
; element is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd to build %k1, then a {%k1}-masked vpshufd from memory.
7954define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
7955; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
7956; GENERIC:       # %bb.0:
7957; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
7958; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:0.50]
7959; GENERIC-NEXT:    retq # sched: [1:1.00]
7960;
7961; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
7962; SKX:       # %bb.0:
7963; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
7964; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
7965; SKX-NEXT:    retq # sched: [7:1.00]
7966  %vec = load <4 x i32>, <4 x i32>* %vp
7967  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
7968  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7969  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
7970  ret <4 x i32> %res
7971}
7972
; Zero-masked permute <0,3,0,1> of a loaded <4 x i32>: lanes whose %mask
; element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
7973define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
7974; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
7975; GENERIC:       # %bb.0:
7976; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
7977; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:0.50]
7978; GENERIC-NEXT:    retq # sched: [1:1.00]
7979;
7980; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
7981; SKX:       # %bb.0:
7982; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
7983; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
7984; SKX-NEXT:    retq # sched: [7:1.00]
7985  %vec = load <4 x i32>, <4 x i32>* %vp
7986  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
7987  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
7988  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
7989  ret <4 x i32> %res
7990}
7991
; Unmasked permute <1,0,1,0> of a <4 x i32> loaded from %vp.
; Both prefixes expect one vpermilps reading its source from memory.
7992define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
7993; GENERIC-LABEL: test_4xi32_perm_mem_mask3:
7994; GENERIC:       # %bb.0:
7995; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
7996; GENERIC-NEXT:    retq # sched: [1:1.00]
7997;
7998; SKX-LABEL: test_4xi32_perm_mem_mask3:
7999; SKX:       # %bb.0:
8000; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
8001; SKX-NEXT:    retq # sched: [7:1.00]
8002  %vec = load <4 x i32>, <4 x i32>* %vp
8003  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
8004  ret <4 x i32> %res
8005}
; Merge-masked permute <1,0,1,0> of a loaded <4 x i32>: lanes whose %mask
; element is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd to build %k1, then a {%k1}-masked vpshufd from memory.
8006define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
8007; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
8008; GENERIC:       # %bb.0:
8009; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
8010; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:0.50]
8011; GENERIC-NEXT:    retq # sched: [1:1.00]
8012;
8013; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
8014; SKX:       # %bb.0:
8015; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
8016; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
8017; SKX-NEXT:    retq # sched: [7:1.00]
8018  %vec = load <4 x i32>, <4 x i32>* %vp
8019  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
8020  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
8021  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
8022  ret <4 x i32> %res
8023}
8024
; Zero-masked permute <1,0,1,0> of a loaded <4 x i32>: lanes whose %mask
; element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
8025define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
8026; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
8027; GENERIC:       # %bb.0:
8028; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
8029; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:0.50]
8030; GENERIC-NEXT:    retq # sched: [1:1.00]
8031;
8032; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
8033; SKX:       # %bb.0:
8034; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
8035; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
8036; SKX-NEXT:    retq # sched: [7:1.00]
8037  %vec = load <4 x i32>, <4 x i32>* %vp
8038  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
8039  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
8040  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
8041  ret <4 x i32> %res
8042}
8043
; Unmasked permute <2,3,1,0,6,7,5,4> of the <8 x i32> register argument %vec.
; Both prefixes expect a single register-to-register vpermilps on ymm0.
8044define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) {
8045; GENERIC-LABEL: test2_8xi32_perm_mask0:
8046; GENERIC:       # %bb.0:
8047; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
8048; GENERIC-NEXT:    retq # sched: [1:1.00]
8049;
8050; SKX-LABEL: test2_8xi32_perm_mask0:
8051; SKX:       # %bb.0:
8052; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
8053; SKX-NEXT:    retq # sched: [7:1.00]
8054  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
8055  ret <8 x i32> %res
8056}
; Merge-masked permute <2,3,1,0,6,7,5,4> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd -> %k1, a {%k1}-masked vpshufd into ymm1, then a
; vmovdqa copying ymm1 to ymm0.
8057define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
8058; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
8059; GENERIC:       # %bb.0:
8060; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8061; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
8062; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
8063; GENERIC-NEXT:    retq # sched: [1:1.00]
8064;
8065; SKX-LABEL: test2_masked_8xi32_perm_mask0:
8066; SKX:       # %bb.0:
8067; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8068; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
8069; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
8070; SKX-NEXT:    retq # sched: [7:1.00]
8071  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
8072  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8073  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8074  ret <8 x i32> %res
8075}
8076
; Zero-masked permute <2,3,1,0,6,7,5,4> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd -> %k1 followed by vpshufd {%k1} {z} on ymm0.
8077define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
8078; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
8079; GENERIC:       # %bb.0:
8080; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8081; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
8082; GENERIC-NEXT:    retq # sched: [1:1.00]
8083;
8084; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
8085; SKX:       # %bb.0:
8086; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8087; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
8088; SKX-NEXT:    retq # sched: [7:1.00]
8089  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
8090  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8091  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8092  ret <8 x i32> %res
8093}
; Merge-masked permute <0,3,3,3,4,7,7,7> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd -> %k1, a {%k1}-masked vpshufd into ymm1, then a
; vmovdqa copying ymm1 to ymm0.
8094define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
8095; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
8096; GENERIC:       # %bb.0:
8097; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8098; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
8099; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
8100; GENERIC-NEXT:    retq # sched: [1:1.00]
8101;
8102; SKX-LABEL: test2_masked_8xi32_perm_mask1:
8103; SKX:       # %bb.0:
8104; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8105; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
8106; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
8107; SKX-NEXT:    retq # sched: [7:1.00]
8108  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
8109  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8110  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8111  ret <8 x i32> %res
8112}
8113
; Zero-masked permute <0,3,3,3,4,7,7,7> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd -> %k1 followed by vpshufd {%k1} {z} on ymm0.
8114define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
8115; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
8116; GENERIC:       # %bb.0:
8117; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8118; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
8119; GENERIC-NEXT:    retq # sched: [1:1.00]
8120;
8121; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
8122; SKX:       # %bb.0:
8123; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8124; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
8125; SKX-NEXT:    retq # sched: [7:1.00]
8126  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
8127  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8128  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8129  ret <8 x i32> %res
8130}
; Merge-masked permute <1,2,0,3,5,6,4,7> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd -> %k1, a {%k1}-masked vpshufd into ymm1, then a
; vmovdqa copying ymm1 to ymm0.
8131define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
8132; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
8133; GENERIC:       # %bb.0:
8134; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8135; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
8136; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
8137; GENERIC-NEXT:    retq # sched: [1:1.00]
8138;
8139; SKX-LABEL: test2_masked_8xi32_perm_mask2:
8140; SKX:       # %bb.0:
8141; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8142; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
8143; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
8144; SKX-NEXT:    retq # sched: [7:1.00]
8145  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
8146  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8147  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8148  ret <8 x i32> %res
8149}
8150
; Zero-masked permute <1,2,0,3,5,6,4,7> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd -> %k1 followed by vpshufd {%k1} {z} on ymm0.
8151define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
8152; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
8153; GENERIC:       # %bb.0:
8154; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8155; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
8156; GENERIC-NEXT:    retq # sched: [1:1.00]
8157;
8158; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
8159; SKX:       # %bb.0:
8160; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8161; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
8162; SKX-NEXT:    retq # sched: [7:1.00]
8163  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
8164  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8165  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8166  ret <8 x i32> %res
8167}
; Unmasked permute <1,3,1,0,5,7,5,4> of the <8 x i32> register argument %vec.
; Both prefixes expect a single register-to-register vpermilps on ymm0.
8168define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) {
8169; GENERIC-LABEL: test2_8xi32_perm_mask3:
8170; GENERIC:       # %bb.0:
8171; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
8172; GENERIC-NEXT:    retq # sched: [1:1.00]
8173;
8174; SKX-LABEL: test2_8xi32_perm_mask3:
8175; SKX:       # %bb.0:
8176; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
8177; SKX-NEXT:    retq # sched: [7:1.00]
8178  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
8179  ret <8 x i32> %res
8180}
; Merge-masked permute <1,3,1,0,5,7,5,4> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd -> %k1, a {%k1}-masked vpshufd into ymm1, then a
; vmovdqa copying ymm1 to ymm0.
8181define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
8182; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
8183; GENERIC:       # %bb.0:
8184; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8185; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
8186; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
8187; GENERIC-NEXT:    retq # sched: [1:1.00]
8188;
8189; SKX-LABEL: test2_masked_8xi32_perm_mask3:
8190; SKX:       # %bb.0:
8191; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8192; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
8193; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
8194; SKX-NEXT:    retq # sched: [7:1.00]
8195  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
8196  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8197  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8198  ret <8 x i32> %res
8199}
8200
; Zero-masked permute <1,3,1,0,5,7,5,4> of %vec: lanes whose %mask element
; is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd -> %k1 followed by vpshufd {%k1} {z} on ymm0.
8201define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
8202; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
8203; GENERIC:       # %bb.0:
8204; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8205; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
8206; GENERIC-NEXT:    retq # sched: [1:1.00]
8207;
8208; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
8209; SKX:       # %bb.0:
8210; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8211; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
8212; SKX-NEXT:    retq # sched: [7:1.00]
8213  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
8214  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8215  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8216  ret <8 x i32> %res
8217}
; Unmasked permute <1,0,2,0,5,4,6,4> of an <8 x i32> loaded from %vp.
; Both prefixes expect one vpermilps reading its source from memory.
8218define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
8219; GENERIC-LABEL: test2_8xi32_perm_mem_mask0:
8220; GENERIC:       # %bb.0:
8221; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
8222; GENERIC-NEXT:    retq # sched: [1:1.00]
8223;
8224; SKX-LABEL: test2_8xi32_perm_mem_mask0:
8225; SKX:       # %bb.0:
8226; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
8227; SKX-NEXT:    retq # sched: [7:1.00]
8228  %vec = load <8 x i32>, <8 x i32>* %vp
8229  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
8230  ret <8 x i32> %res
8231}
; Merge-masked permute <1,0,2,0,5,4,6,4> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd to build %k1, then a {%k1}-masked vpshufd from memory.
8232define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
8233; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
8234; GENERIC:       # %bb.0:
8235; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8236; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
8237; GENERIC-NEXT:    retq # sched: [1:1.00]
8238;
8239; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
8240; SKX:       # %bb.0:
8241; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8242; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
8243; SKX-NEXT:    retq # sched: [7:1.00]
8244  %vec = load <8 x i32>, <8 x i32>* %vp
8245  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
8246  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8247  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8248  ret <8 x i32> %res
8249}
8250
; Zero-masked permute <1,0,2,0,5,4,6,4> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
8251define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
8252; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
8253; GENERIC:       # %bb.0:
8254; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
8255; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
8256; GENERIC-NEXT:    retq # sched: [1:1.00]
8257;
8258; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
8259; SKX:       # %bb.0:
8260; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
8261; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
8262; SKX-NEXT:    retq # sched: [7:1.00]
8263  %vec = load <8 x i32>, <8 x i32>* %vp
8264  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
8265  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8266  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8267  ret <8 x i32> %res
8268}
8269
; Merge-masked permute <0,3,2,0,4,7,6,4> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd to build %k1, then a {%k1}-masked vpshufd from memory.
8270define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
8271; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
8272; GENERIC:       # %bb.0:
8273; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8274; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
8275; GENERIC-NEXT:    retq # sched: [1:1.00]
8276;
8277; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
8278; SKX:       # %bb.0:
8279; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8280; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
8281; SKX-NEXT:    retq # sched: [7:1.00]
8282  %vec = load <8 x i32>, <8 x i32>* %vp
8283  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
8284  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8285  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8286  ret <8 x i32> %res
8287}
8288
; Zero-masked permute <0,3,2,0,4,7,6,4> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
8289define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
8290; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
8291; GENERIC:       # %bb.0:
8292; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
8293; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
8294; GENERIC-NEXT:    retq # sched: [1:1.00]
8295;
8296; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
8297; SKX:       # %bb.0:
8298; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
8299; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
8300; SKX-NEXT:    retq # sched: [7:1.00]
8301  %vec = load <8 x i32>, <8 x i32>* %vp
8302  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
8303  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8304  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8305  ret <8 x i32> %res
8306}
8307
; Merge-masked permute <3,2,3,1,7,6,7,5> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd to build %k1, then a {%k1}-masked vpshufd from memory.
8308define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
8309; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
8310; GENERIC:       # %bb.0:
8311; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8312; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
8313; GENERIC-NEXT:    retq # sched: [1:1.00]
8314;
8315; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
8316; SKX:       # %bb.0:
8317; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8318; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
8319; SKX-NEXT:    retq # sched: [7:1.00]
8320  %vec = load <8 x i32>, <8 x i32>* %vp
8321  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
8322  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8323  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8324  ret <8 x i32> %res
8325}
8326
; Zero-masked permute <3,2,3,1,7,6,7,5> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
8327define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
8328; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
8329; GENERIC:       # %bb.0:
8330; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
8331; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
8332; GENERIC-NEXT:    retq # sched: [1:1.00]
8333;
8334; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
8335; SKX:       # %bb.0:
8336; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
8337; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
8338; SKX-NEXT:    retq # sched: [7:1.00]
8339  %vec = load <8 x i32>, <8 x i32>* %vp
8340  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
8341  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8342  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8343  ret <8 x i32> %res
8344}
8345
; Unmasked permute <3,2,0,0,7,6,4,4> of an <8 x i32> loaded from %vp.
; Both prefixes expect one vpermilps reading its source from memory.
8346define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
8347; GENERIC-LABEL: test2_8xi32_perm_mem_mask3:
8348; GENERIC:       # %bb.0:
8349; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
8350; GENERIC-NEXT:    retq # sched: [1:1.00]
8351;
8352; SKX-LABEL: test2_8xi32_perm_mem_mask3:
8353; SKX:       # %bb.0:
8354; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
8355; SKX-NEXT:    retq # sched: [7:1.00]
8356  %vec = load <8 x i32>, <8 x i32>* %vp
8357  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
8358  ret <8 x i32> %res
8359}
; Merge-masked permute <3,2,0,0,7,6,4,4> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes keep %vec2.
; Checks expect vptestnmd to build %k1, then a {%k1}-masked vpshufd from memory.
8360define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
8361; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
8362; GENERIC:       # %bb.0:
8363; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8364; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
8365; GENERIC-NEXT:    retq # sched: [1:1.00]
8366;
8367; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
8368; SKX:       # %bb.0:
8369; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8370; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
8371; SKX-NEXT:    retq # sched: [7:1.00]
8372  %vec = load <8 x i32>, <8 x i32>* %vp
8373  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
8374  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8375  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
8376  ret <8 x i32> %res
8377}
8378
; Zero-masked permute <3,2,0,0,7,6,4,4> of a loaded <8 x i32>: lanes whose
; %mask element is zero take the permuted value, all other lanes become zero.
; Checks expect vptestnmd to build %k1, then vpshufd {%k1} {z} from memory.
8379define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
8380; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
8381; GENERIC:       # %bb.0:
8382; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
8383; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
8384; GENERIC-NEXT:    retq # sched: [1:1.00]
8385;
8386; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
8387; SKX:       # %bb.0:
8388; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
8389; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
8390; SKX-NEXT:    retq # sched: [7:1.00]
8391  %vec = load <8 x i32>, <8 x i32>* %vp
8392  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
8393  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8394  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
8395  ret <8 x i32> %res
8396}
8397
; Unmasked permute of the <16 x i32> register argument %vec; the index vector
; repeats the pattern 3,1,3,0 with an offset of 4 per group of four lanes.
; Both prefixes expect a single register-to-register vpermilps on zmm0.
8398define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) {
8399; GENERIC-LABEL: test2_16xi32_perm_mask0:
8400; GENERIC:       # %bb.0:
8401; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
8402; GENERIC-NEXT:    retq # sched: [1:1.00]
8403;
8404; SKX-LABEL: test2_16xi32_perm_mask0:
8405; SKX:       # %bb.0:
8406; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
8407; SKX-NEXT:    retq # sched: [7:1.00]
8408  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
8409  ret <16 x i32> %res
8410}
; Merge-masked <16 x i32> permute (pattern 3,1,3,0 per group of four lanes):
; lanes whose %mask element is zero take the permuted value, others keep %vec2.
; Checks expect vptestnmd -> %k1, a {%k1}-masked vpshufd into zmm1, then a
; vmovdqa64 copying zmm1 to zmm0.
8411define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
8412; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
8413; GENERIC:       # %bb.0:
8414; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
8415; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
8416; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
8417; GENERIC-NEXT:    retq # sched: [1:1.00]
8418;
8419; SKX-LABEL: test2_masked_16xi32_perm_mask0:
8420; SKX:       # %bb.0:
8421; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
8422; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
8423; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
8424; SKX-NEXT:    retq # sched: [7:1.00]
8425  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
8426  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8427  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8428  ret <16 x i32> %res
8429}
8430
; Zero-masked <16 x i32> permute (pattern 3,1,3,0 per group of four lanes):
; lanes whose %mask element is zero take the permuted value, others become zero.
; Checks expect vptestnmd -> %k1 followed by vpshufd {%k1} {z} on zmm0.
8431define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
8432; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
8433; GENERIC:       # %bb.0:
8434; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8435; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
8436; GENERIC-NEXT:    retq # sched: [1:1.00]
8437;
8438; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
8439; SKX:       # %bb.0:
8440; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8441; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
8442; SKX-NEXT:    retq # sched: [7:1.00]
8443  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
8444  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8445  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8446  ret <16 x i32> %res
8447}
; Merge-masked <16 x i32> permute (pattern 2,0,3,0 per group of four lanes):
; lanes whose %mask element is zero take the permuted value, others keep %vec2.
; Checks expect vptestnmd -> %k1, a {%k1}-masked vpshufd into zmm1, then a
; vmovdqa64 copying zmm1 to zmm0.
8448define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
8449; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
8450; GENERIC:       # %bb.0:
8451; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
8452; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
8453; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
8454; GENERIC-NEXT:    retq # sched: [1:1.00]
8455;
8456; SKX-LABEL: test2_masked_16xi32_perm_mask1:
8457; SKX:       # %bb.0:
8458; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
8459; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
8460; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
8461; SKX-NEXT:    retq # sched: [7:1.00]
8462  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
8463  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8464  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8465  ret <16 x i32> %res
8466}
8467
; Zero-masked <16 x i32> permute (pattern 2,0,3,0 per group of four lanes):
; lanes whose %mask element is zero take the permuted value, others become zero.
; Checks expect vptestnmd -> %k1 followed by vpshufd {%k1} {z} on zmm0.
8468define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
8469; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
8470; GENERIC:       # %bb.0:
8471; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8472; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
8473; GENERIC-NEXT:    retq # sched: [1:1.00]
8474;
8475; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
8476; SKX:       # %bb.0:
8477; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8478; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
8479; SKX-NEXT:    retq # sched: [7:1.00]
8480  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
8481  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8482  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8483  ret <16 x i32> %res
8484}
; Merge-masked <16 x i32> permute (pattern 1,3,3,0 per group of four lanes):
; lanes whose %mask element is zero take the permuted value, others keep %vec2.
; Checks expect vptestnmd -> %k1, a {%k1}-masked vpshufd into zmm1, then a
; vmovdqa64 copying zmm1 to zmm0.
8485define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
8486; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
8487; GENERIC:       # %bb.0:
8488; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
8489; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
8490; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
8491; GENERIC-NEXT:    retq # sched: [1:1.00]
8492;
8493; SKX-LABEL: test2_masked_16xi32_perm_mask2:
8494; SKX:       # %bb.0:
8495; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
8496; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
8497; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
8498; SKX-NEXT:    retq # sched: [7:1.00]
8499  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
8500  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8501  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8502  ret <16 x i32> %res
8503}
8504
; Zero-masked counterpart of the 1,3,3,0 permute: lanes where %mask == 0
; receive the shuffled element of %vec, all other lanes are zeroed.
; The checks expect vptestnmd plus a single {z}-masked vpshufd that writes
; zmm0 directly, so no register copy appears before the return.
8505define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
8506; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
8507; GENERIC:       # %bb.0:
8508; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8509; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
8510; GENERIC-NEXT:    retq # sched: [1:1.00]
8511;
8512; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
8513; SKX:       # %bb.0:
8514; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8515; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
8516; SKX-NEXT:    retq # sched: [7:1.00]
8517  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
8518  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8519  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8520  ret <16 x i32> %res
8521}
; Unmasked 16 x i32 permute using the four-element pattern 3,2,0,3 repeated in
; every group of four lanes.
; With no mask involved, both check blocks expect the shuffle to be emitted as
; vpermilps (the masked variants of this pattern are checked as vpshufd).
8522define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
8523; GENERIC-LABEL: test2_16xi32_perm_mask3:
8524; GENERIC:       # %bb.0:
8525; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
8526; GENERIC-NEXT:    retq # sched: [1:1.00]
8527;
8528; SKX-LABEL: test2_16xi32_perm_mask3:
8529; SKX:       # %bb.0:
8530; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
8531; SKX-NEXT:    retq # sched: [7:1.00]
8532  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
8533  ret <16 x i32> %res
8534}
; Merge-masked version of the 3,2,0,3 permute: lanes where %mask == 0 take the
; shuffled element of %vec; the remaining lanes keep the value from %vec2.
; The checks expect a %k1-masked vpshufd that writes zmm1, followed by a
; vmovdqa64 copy of zmm1 into zmm0 before the return.
8535define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
8536; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
8537; GENERIC:       # %bb.0:
8538; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
8539; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
8540; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
8541; GENERIC-NEXT:    retq # sched: [1:1.00]
8542;
8543; SKX-LABEL: test2_masked_16xi32_perm_mask3:
8544; SKX:       # %bb.0:
8545; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
8546; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
8547; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
8548; SKX-NEXT:    retq # sched: [7:1.00]
8549  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
8550  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8551  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8552  ret <16 x i32> %res
8553}
8554
; Zero-masked version of the 3,2,0,3 permute: lanes where %mask == 0 receive
; the shuffled element of %vec, all other lanes are zeroed.
; The checks expect vptestnmd plus a single {z}-masked vpshufd writing zmm0.
8555define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
8556; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
8557; GENERIC:       # %bb.0:
8558; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8559; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
8560; GENERIC-NEXT:    retq # sched: [1:1.00]
8561;
8562; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
8563; SKX:       # %bb.0:
8564; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8565; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
8566; SKX-NEXT:    retq # sched: [7:1.00]
8567  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
8568  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8569  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8570  ret <16 x i32> %res
8571}
; Unmasked 16 x i32 permute of a vector loaded from %vp, using the
; four-element pattern 1,0,1,3 repeated in every group of four lanes.
; The checks expect no separate load: the shuffle is emitted as one vpermilps
; whose source is the memory operand (shown as mem[...]).
8572define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
8573; GENERIC-LABEL: test2_16xi32_perm_mem_mask0:
8574; GENERIC:       # %bb.0:
8575; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
8576; GENERIC-NEXT:    retq # sched: [1:1.00]
8577;
8578; SKX-LABEL: test2_16xi32_perm_mem_mask0:
8579; SKX:       # %bb.0:
8580; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
8581; SKX-NEXT:    retq # sched: [7:1.00]
8582  %vec = load <16 x i32>, <16 x i32>* %vp
8583  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
8584  ret <16 x i32> %res
8585}
; Merge-masked version of the 1,0,1,3 permute on a vector loaded from %vp:
; lanes where %mask == 0 take the shuffled loaded element, the remaining
; lanes keep the value from %vec2.
; The checks expect the load to be folded into a %k1-masked vpshufd that
; writes zmm0 directly, so no register copy follows the shuffle.
8586define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
8587; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
8588; GENERIC:       # %bb.0:
8589; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8590; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
8591; GENERIC-NEXT:    retq # sched: [1:1.00]
8592;
8593; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
8594; SKX:       # %bb.0:
8595; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8596; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
8597; SKX-NEXT:    retq # sched: [7:1.00]
8598  %vec = load <16 x i32>, <16 x i32>* %vp
8599  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
8600  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8601  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8602  ret <16 x i32> %res
8603}
8604
; Zero-masked version of the 1,0,1,3 permute on a vector loaded from %vp:
; lanes where %mask == 0 receive the shuffled loaded element, all other lanes
; are zeroed.
; The checks expect the load to be folded into a {z}-masked vpshufd; the mask
; vector is tested in zmm0 and the result is written to zmm0.
8605define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
8606; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
8607; GENERIC:       # %bb.0:
8608; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
8609; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
8610; GENERIC-NEXT:    retq # sched: [1:1.00]
8611;
8612; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
8613; SKX:       # %bb.0:
8614; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
8615; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
8616; SKX-NEXT:    retq # sched: [7:1.00]
8617  %vec = load <16 x i32>, <16 x i32>* %vp
8618  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
8619  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8620  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8621  ret <16 x i32> %res
8622}
8623
; Merge-masked 16 x i32 permute of a vector loaded from %vp, using the
; four-element pattern 1,0,0,2 repeated in every group of four lanes.
; Lanes where %mask == 0 take the shuffled loaded element; the others keep
; %vec2. The checks expect the load folded into a %k1-masked vpshufd that
; writes zmm0, with no copy afterwards.
8624define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
8625; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
8626; GENERIC:       # %bb.0:
8627; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8628; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
8629; GENERIC-NEXT:    retq # sched: [1:1.00]
8630;
8631; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
8632; SKX:       # %bb.0:
8633; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8634; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
8635; SKX-NEXT:    retq # sched: [7:1.00]
8636  %vec = load <16 x i32>, <16 x i32>* %vp
8637  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
8638  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8639  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8640  ret <16 x i32> %res
8641}
8642
; Zero-masked version of the 1,0,0,2 permute on a vector loaded from %vp:
; lanes where %mask == 0 receive the shuffled loaded element, all other lanes
; are zeroed.
; The checks expect the load folded into a single {z}-masked vpshufd.
8643define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
8644; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
8645; GENERIC:       # %bb.0:
8646; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
8647; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
8648; GENERIC-NEXT:    retq # sched: [1:1.00]
8649;
8650; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
8651; SKX:       # %bb.0:
8652; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
8653; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
8654; SKX-NEXT:    retq # sched: [7:1.00]
8655  %vec = load <16 x i32>, <16 x i32>* %vp
8656  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
8657  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8658  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8659  ret <16 x i32> %res
8660}
8661
; Merge-masked 16 x i32 permute of a vector loaded from %vp, using the
; four-element pattern 2,0,1,2 repeated in every group of four lanes.
; Lanes where %mask == 0 take the shuffled loaded element; the others keep
; %vec2. The checks expect the load folded into a %k1-masked vpshufd that
; writes zmm0, with no copy afterwards.
8662define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
8663; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
8664; GENERIC:       # %bb.0:
8665; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8666; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
8667; GENERIC-NEXT:    retq # sched: [1:1.00]
8668;
8669; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
8670; SKX:       # %bb.0:
8671; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8672; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
8673; SKX-NEXT:    retq # sched: [7:1.00]
8674  %vec = load <16 x i32>, <16 x i32>* %vp
8675  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
8676  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8677  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8678  ret <16 x i32> %res
8679}
8680
; Zero-masked version of the 2,0,1,2 permute on a vector loaded from %vp:
; lanes where %mask == 0 receive the shuffled loaded element, all other lanes
; are zeroed.
; The checks expect the load folded into a single {z}-masked vpshufd.
8681define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
8682; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
8683; GENERIC:       # %bb.0:
8684; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
8685; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
8686; GENERIC-NEXT:    retq # sched: [1:1.00]
8687;
8688; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
8689; SKX:       # %bb.0:
8690; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
8691; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
8692; SKX-NEXT:    retq # sched: [7:1.00]
8693  %vec = load <16 x i32>, <16 x i32>* %vp
8694  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
8695  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8696  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8697  ret <16 x i32> %res
8698}
8699
; Unmasked 16 x i32 permute of a vector loaded from %vp, using the
; four-element pattern 3,1,1,1 repeated in every group of four lanes.
; The checks expect the load to be folded into a single vpermilps with a
; memory source operand.
8700define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
8701; GENERIC-LABEL: test2_16xi32_perm_mem_mask3:
8702; GENERIC:       # %bb.0:
8703; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
8704; GENERIC-NEXT:    retq # sched: [1:1.00]
8705;
8706; SKX-LABEL: test2_16xi32_perm_mem_mask3:
8707; SKX:       # %bb.0:
8708; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
8709; SKX-NEXT:    retq # sched: [7:1.00]
8710  %vec = load <16 x i32>, <16 x i32>* %vp
8711  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
8712  ret <16 x i32> %res
8713}
; Merge-masked version of the 3,1,1,1 permute on a vector loaded from %vp:
; lanes where %mask == 0 take the shuffled loaded element, the remaining
; lanes keep the value from %vec2.
; The checks expect the load folded into a %k1-masked vpshufd that writes
; zmm0, with no copy afterwards.
8714define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
8715; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
8716; GENERIC:       # %bb.0:
8717; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
8718; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
8719; GENERIC-NEXT:    retq # sched: [1:1.00]
8720;
8721; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
8722; SKX:       # %bb.0:
8723; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
8724; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
8725; SKX-NEXT:    retq # sched: [7:1.00]
8726  %vec = load <16 x i32>, <16 x i32>* %vp
8727  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
8728  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8729  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
8730  ret <16 x i32> %res
8731}
8732
; Zero-masked version of the 3,1,1,1 permute on a vector loaded from %vp:
; lanes where %mask == 0 receive the shuffled loaded element, all other lanes
; are zeroed.
; The checks expect the load folded into a single {z}-masked vpshufd.
8733define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
8734; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
8735; GENERIC:       # %bb.0:
8736; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
8737; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
8738; GENERIC-NEXT:    retq # sched: [1:1.00]
8739;
8740; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
8741; SKX:       # %bb.0:
8742; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
8743; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
8744; SKX-NEXT:    retq # sched: [7:1.00]
8745  %vec = load <16 x i32>, <16 x i32>* %vp
8746  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
8747  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
8748  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
8749  ret <16 x i32> %res
8750}
8751
; Unmasked two-source shuffle of 8 x float: result elements 0-3 are elements
; 4-7 of %vec1 and result elements 4-7 are elements 0-3 of %vec2 (shuffle
; indices 8-11 address the second operand).
; With no mask involved, both check blocks expect a single vperm2f128.
8752define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
8753; GENERIC-LABEL: test2_8xfloat_shuff_mask0:
8754; GENERIC:       # %bb.0:
8755; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
8756; GENERIC-NEXT:    retq # sched: [1:1.00]
8757;
8758; SKX-LABEL: test2_8xfloat_shuff_mask0:
8759; SKX:       # %bb.0:
8760; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
8761; SKX-NEXT:    retq # sched: [7:1.00]
8762  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8763  ret <8 x float> %res
8764}
; Merge-masked two-source shuffle of 8 x float (upper four elements of %vec1
; followed by lower four elements of %vec2). Lanes where %mask == 0 take the
; shuffled value; the remaining lanes keep the value from %vec3.
; The checks expect a %k1-masked vshuff32x4 that writes ymm2, followed by a
; vmovaps copy of ymm2 into ymm0 before the return.
8765define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
8766; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
8767; GENERIC:       # %bb.0:
8768; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
8769; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
8770; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
8771; GENERIC-NEXT:    retq # sched: [1:1.00]
8772;
8773; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
8774; SKX:       # %bb.0:
8775; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
8776; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
8777; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
8778; SKX-NEXT:    retq # sched: [7:1.00]
8779  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8780  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8781  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
8782  ret <8 x float> %res
8783}
8784
; Zero-masked two-source shuffle of 8 x float (upper four elements of %vec1
; followed by lower four elements of %vec2). Lanes where %mask == 0 receive
; the shuffled value; all other lanes are zeroed.
; The checks expect vptestnmd plus a single {z}-masked vshuff32x4 writing ymm0.
8785define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
8786; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
8787; GENERIC:       # %bb.0:
8788; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8789; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
8790; GENERIC-NEXT:    retq # sched: [1:1.00]
8791;
8792; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
8793; SKX:       # %bb.0:
8794; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8795; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
8796; SKX-NEXT:    retq # sched: [7:1.00]
8797  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8798  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8799  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
8800  ret <8 x float> %res
8801}
; Merge-masked two-source shuffle of 8 x float (upper four elements of %vec1
; followed by lower four elements of %vec2). Lanes where %mask == 0 take the
; shuffled value; the remaining lanes keep the value from %vec3.
; The checks expect a %k1-masked vshuff32x4 writing ymm2, then a vmovaps copy
; into ymm0.
; NOTE(review): the shuffle indices here are identical to those of
; test2_8xfloat_masked_shuff_mask0, so the expected code is the same apart
; from the label - confirm whether a distinct index pattern was intended.
8802define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
8803; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
8804; GENERIC:       # %bb.0:
8805; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
8806; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
8807; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
8808; GENERIC-NEXT:    retq # sched: [1:1.00]
8809;
8810; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
8811; SKX:       # %bb.0:
8812; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
8813; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
8814; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
8815; SKX-NEXT:    retq # sched: [7:1.00]
8816  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8817  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8818  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
8819  ret <8 x float> %res
8820}
8821
; Zero-masked two-source shuffle of 8 x float (upper four elements of %vec1
; followed by lower four elements of %vec2). Lanes where %mask == 0 receive
; the shuffled value; all other lanes are zeroed.
; The checks expect vptestnmd plus a single {z}-masked vshuff32x4 writing ymm0.
; NOTE(review): the shuffle indices here are identical to those of
; test2_8xfloat_zero_masked_shuff_mask0 - confirm whether a distinct index
; pattern was intended.
8822define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
8823; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
8824; GENERIC:       # %bb.0:
8825; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8826; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
8827; GENERIC-NEXT:    retq # sched: [1:1.00]
8828;
8829; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
8830; SKX:       # %bb.0:
8831; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8832; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
8833; SKX-NEXT:    retq # sched: [7:1.00]
8834  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8835  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8836  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
8837  ret <8 x float> %res
8838}
; Merge-masked two-source shuffle of 8 x float taking the upper four elements
; of %vec1 followed by the upper four elements of %vec2 (indices 12-15 address
; elements 4-7 of the second operand). Lanes where %mask == 0 take the
; shuffled value; the remaining lanes keep the value from %vec3.
; The checks expect a %k1-masked vshuff32x4 writing ymm2, then a vmovaps copy
; into ymm0.
8839define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
8840; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
8841; GENERIC:       # %bb.0:
8842; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
8843; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
8844; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
8845; GENERIC-NEXT:    retq # sched: [1:1.00]
8846;
8847; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
8848; SKX:       # %bb.0:
8849; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
8850; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
8851; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
8852; SKX-NEXT:    retq # sched: [7:1.00]
8853  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
8854  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8855  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
8856  ret <8 x float> %res
8857}
8858
; Zero-masked two-source shuffle of 8 x float taking the upper four elements
; of %vec1 followed by the upper four elements of %vec2. Lanes where
; %mask == 0 receive the shuffled value; all other lanes are zeroed.
; The checks expect vptestnmd plus a single {z}-masked vshuff32x4 writing ymm0.
8859define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
8860; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
8861; GENERIC:       # %bb.0:
8862; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8863; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
8864; GENERIC-NEXT:    retq # sched: [1:1.00]
8865;
8866; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
8867; SKX:       # %bb.0:
8868; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8869; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
8870; SKX-NEXT:    retq # sched: [7:1.00]
8871  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
8872  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8873  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
8874  ret <8 x float> %res
8875}
; Unmasked two-source shuffle of 8 x float: upper four elements of %vec1
; followed by lower four elements of %vec2. Both check blocks expect a single
; vperm2f128.
; NOTE(review): the shuffle indices here are identical to those of
; test2_8xfloat_shuff_mask0 - confirm whether a distinct index pattern was
; intended.
8876define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
8877; GENERIC-LABEL: test2_8xfloat_shuff_mask3:
8878; GENERIC:       # %bb.0:
8879; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
8880; GENERIC-NEXT:    retq # sched: [1:1.00]
8881;
8882; SKX-LABEL: test2_8xfloat_shuff_mask3:
8883; SKX:       # %bb.0:
8884; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
8885; SKX-NEXT:    retq # sched: [7:1.00]
8886  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8887  ret <8 x float> %res
8888}
; Merge-masked two-source shuffle of 8 x float (upper four elements of %vec1
; followed by lower four elements of %vec2). Lanes where %mask == 0 take the
; shuffled value; the remaining lanes keep the value from %vec3.
; The checks expect a %k1-masked vshuff32x4 writing ymm2, then a vmovaps copy
; into ymm0.
; NOTE(review): the shuffle indices here are identical to those of
; test2_8xfloat_masked_shuff_mask0 - confirm whether a distinct index pattern
; was intended.
8889define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
8890; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
8891; GENERIC:       # %bb.0:
8892; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
8893; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
8894; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
8895; GENERIC-NEXT:    retq # sched: [1:1.00]
8896;
8897; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
8898; SKX:       # %bb.0:
8899; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
8900; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
8901; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
8902; SKX-NEXT:    retq # sched: [7:1.00]
8903  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8904  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8905  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
8906  ret <8 x float> %res
8907}
8908
; Zero-masked two-source shuffle of 8 x float (upper four elements of %vec1
; followed by lower four elements of %vec2). Lanes where %mask == 0 receive
; the shuffled value; all other lanes are zeroed.
; The checks expect vptestnmd plus a single {z}-masked vshuff32x4 writing ymm0.
; NOTE(review): this function uses the test_ name prefix while the unmasked
; and merge-masked mask3 variants use test2_ - presumably an oversight;
; renaming would also require updating the LABEL checks below.
8909define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
8910; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
8911; GENERIC:       # %bb.0:
8912; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8913; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
8914; GENERIC-NEXT:    retq # sched: [1:1.00]
8915;
8916; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
8917; SKX:       # %bb.0:
8918; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8919; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
8920; SKX-NEXT:    retq # sched: [7:1.00]
8921  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
8922  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8923  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
8924  ret <8 x float> %res
8925}
; Unmasked two-source shuffle of 8 x float where the second source is loaded
; from %vec2p: upper four elements of %vec1 followed by the upper four
; elements of the loaded vector.
; The checks expect the load to be folded into a single vperm2f128 with a
; memory operand.
8926define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
8927; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0:
8928; GENERIC:       # %bb.0:
8929; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
8930; GENERIC-NEXT:    retq # sched: [1:1.00]
8931;
8932; SKX-LABEL: test_8xfloat_shuff_mem_mask0:
8933; SKX:       # %bb.0:
8934; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
8935; SKX-NEXT:    retq # sched: [7:1.00]
8936  %vec2 = load <8 x float>, <8 x float>* %vec2p
8937  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
8938  ret <8 x float> %res
8939}
; Merge-masked two-source shuffle of 8 x float where the second source is
; loaded from %vec2p (upper four elements of %vec1, then upper four elements
; of the loaded vector). Lanes where %mask == 0 take the shuffled value; the
; remaining lanes keep the value from %vec3.
; The checks expect the load folded into a %k1-masked vshuff32x4 writing
; ymm1, then a vmovaps copy of ymm1 into ymm0.
8940define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
8941; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
8942; GENERIC:       # %bb.0:
8943; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8944; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
8945; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
8946; GENERIC-NEXT:    retq # sched: [1:1.00]
8947;
8948; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
8949; SKX:       # %bb.0:
8950; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8951; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
8952; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
8953; SKX-NEXT:    retq # sched: [7:1.00]
8954  %vec2 = load <8 x float>, <8 x float>* %vec2p
8955  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
8956  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8957  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
8958  ret <8 x float> %res
8959}
8960
; Zero-masked two-source shuffle of 8 x float where the second source is
; loaded from %vec2p (upper four elements of %vec1, then upper four elements
; of the loaded vector). Lanes where %mask == 0 receive the shuffled value;
; all other lanes are zeroed.
; The checks expect the load folded into a single {z}-masked vshuff32x4
; writing ymm0.
8961define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
8962; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
8963; GENERIC:       # %bb.0:
8964; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
8965; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
8966; GENERIC-NEXT:    retq # sched: [1:1.00]
8967;
8968; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
8969; SKX:       # %bb.0:
8970; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
8971; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
8972; SKX-NEXT:    retq # sched: [7:1.00]
8973  %vec2 = load <8 x float>, <8 x float>* %vec2p
8974  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
8975  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8976  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
8977  ret <8 x float> %res
8978}
8979
; Merge-masked two-source shuffle of 8 x float where the second source is
; loaded from %vec2p (upper four elements of %vec1, then upper four elements
; of the loaded vector). Lanes where %mask == 0 take the shuffled value; the
; remaining lanes keep the value from %vec3.
; The checks expect the load folded into a %k1-masked vshuff32x4 writing
; ymm1, then a vmovaps copy of ymm1 into ymm0.
; NOTE(review): the shuffle indices here are identical to those of
; test_8xfloat_masked_shuff_mem_mask0 - confirm whether a distinct index
; pattern was intended.
8980define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
8981; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
8982; GENERIC:       # %bb.0:
8983; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
8984; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
8985; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
8986; GENERIC-NEXT:    retq # sched: [1:1.00]
8987;
8988; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
8989; SKX:       # %bb.0:
8990; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
8991; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
8992; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
8993; SKX-NEXT:    retq # sched: [7:1.00]
8994  %vec2 = load <8 x float>, <8 x float>* %vec2p
8995  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
8996  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
8997  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
8998  ret <8 x float> %res
8999}
9000
; Zero-masked two-source shuffle of 8 x float where the second source is
; loaded from %vec2p (upper four elements of %vec1, then upper four elements
; of the loaded vector). Lanes where %mask == 0 receive the shuffled value;
; all other lanes are zeroed.
; The checks expect the load folded into a single {z}-masked vshuff32x4
; writing ymm0.
; NOTE(review): the shuffle indices here are identical to those of
; test_8xfloat_zero_masked_shuff_mem_mask0 - confirm whether a distinct index
; pattern was intended.
9001define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
9002; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
9003; GENERIC:       # %bb.0:
9004; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
9005; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
9006; GENERIC-NEXT:    retq # sched: [1:1.00]
9007;
9008; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
9009; SKX:       # %bb.0:
9010; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
9011; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
9012; SKX-NEXT:    retq # sched: [7:1.00]
9013  %vec2 = load <8 x float>, <8 x float>* %vec2p
9014  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
9015  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
9016  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
9017  ret <8 x float> %res
9018}
9019
; Merge-masked two-source shuffle of 8 x float where the second source is
; loaded from %vec2p (upper four elements of %vec1, then lower four elements
; of the loaded vector). Lanes where %mask == 0 take the shuffled value; the
; remaining lanes keep the value from %vec3.
; The checks expect the load folded into a %k1-masked vshuff32x4 writing
; ymm1, then a vmovaps copy of ymm1 into ymm0.
9020define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
9021; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
9022; GENERIC:       # %bb.0:
9023; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
9024; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
9025; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
9026; GENERIC-NEXT:    retq # sched: [1:1.00]
9027;
9028; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
9029; SKX:       # %bb.0:
9030; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
9031; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
9032; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
9033; SKX-NEXT:    retq # sched: [7:1.00]
9034  %vec2 = load <8 x float>, <8 x float>* %vec2p
9035  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
9036  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
9037  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
9038  ret <8 x float> %res
9039}
9040
; Zero-masked shuffle with a memory operand: the result is elements 4-7 of
; %vec1 followed by elements 0-3 of the vector loaded from %vec2p. Lanes whose
; %mask element is zero take the shuffled value; every other lane is zeroed.
9041define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
9042; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
9043; GENERIC:       # %bb.0:
9044; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
9045; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
9046; GENERIC-NEXT:    retq # sched: [1:1.00]
9047;
9048; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
9049; SKX:       # %bb.0:
9050; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
9051; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
9052; SKX-NEXT:    retq # sched: [7:1.00]
9053  %src2 = load <8 x float>, <8 x float>* %vec2p
9054  %perm = shufflevector <8 x float> %vec1, <8 x float> %src2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
9055  %iszero = icmp eq <8 x i32> %mask, zeroinitializer
9056  %out = select <8 x i1> %iszero, <8 x float> %perm, <8 x float> zeroinitializer
9057  ret <8 x float> %out
9058}
9059
; Unmasked shuffle with a memory operand: returns elements 4-7 of %vec1
; followed by elements 0-3 of the <8 x float> loaded from %vec2p.
9060define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
9061; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3:
9062; GENERIC:       # %bb.0:
9063; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
9064; GENERIC-NEXT:    retq # sched: [1:1.00]
9065;
9066; SKX-LABEL: test_8xfloat_shuff_mem_mask3:
9067; SKX:       # %bb.0:
9068; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
9069; SKX-NEXT:    retq # sched: [7:1.00]
9070  %src2 = load <8 x float>, <8 x float>* %vec2p
9071  %perm = shufflevector <8 x float> %vec1, <8 x float> %src2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
9072  ret <8 x float> %perm
9073}
; Merge-masked form of the mem_mask3 shuffle: elements 4-7 of %vec1 followed
; by elements 0-3 of the vector loaded from %vec2p. Where the %mask element is
; zero the shuffled lane is used; elsewhere the lane comes from %vec3.
9074define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
9075; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
9076; GENERIC:       # %bb.0:
9077; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
9078; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
9079; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
9080; GENERIC-NEXT:    retq # sched: [1:1.00]
9081;
9082; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
9083; SKX:       # %bb.0:
9084; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
9085; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
9086; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
9087; SKX-NEXT:    retq # sched: [7:1.00]
9088  %src2 = load <8 x float>, <8 x float>* %vec2p
9089  %perm = shufflevector <8 x float> %vec1, <8 x float> %src2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
9090  %iszero = icmp eq <8 x i32> %mask, zeroinitializer
9091  %out = select <8 x i1> %iszero, <8 x float> %perm, <8 x float> %vec3
9092  ret <8 x float> %out
9093}
9094
; Zero-masked form of the mem_mask3 shuffle: elements 4-7 of %vec1 followed by
; elements 0-3 of the vector loaded from %vec2p. Where the %mask element is
; zero the shuffled lane is used; elsewhere the lane is zero.
9095define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
9096; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
9097; GENERIC:       # %bb.0:
9098; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
9099; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
9100; GENERIC-NEXT:    retq # sched: [1:1.00]
9101;
9102; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
9103; SKX:       # %bb.0:
9104; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
9105; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
9106; SKX-NEXT:    retq # sched: [7:1.00]
9107  %src2 = load <8 x float>, <8 x float>* %vec2p
9108  %perm = shufflevector <8 x float> %vec1, <8 x float> %src2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
9109  %iszero = icmp eq <8 x i32> %mask, zeroinitializer
9110  %out = select <8 x i1> %iszero, <8 x float> %perm, <8 x float> zeroinitializer
9111  ret <8 x float> %out
9112}
9113
; Unmasked register shuffle of <16 x float>: returns %vec1[12-15], %vec1[0-3],
; %vec2[4-7], %vec2[12-15]. The %mask parameter is declared but not referenced
; by the body.
9114define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
9115; GENERIC-LABEL: test_16xfloat_shuff_mask0:
9116; GENERIC:       # %bb.0:
9117; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
9118; GENERIC-NEXT:    retq # sched: [1:1.00]
9119;
9120; SKX-LABEL: test_16xfloat_shuff_mask0:
9121; SKX:       # %bb.0:
9122; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
9123; SKX-NEXT:    retq # sched: [7:1.00]
9124  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
9125  ret <16 x float> %perm
9126}
; Merge-masked register shuffle of <16 x float>: %vec1[12-15], %vec1[0-3],
; %vec2[4-7], %vec2[12-15]. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes keep %vec3.
9127define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
9128; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
9129; GENERIC:       # %bb.0:
9130; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
9131; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
9132; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
9133; GENERIC-NEXT:    retq # sched: [1:1.00]
9134;
9135; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
9136; SKX:       # %bb.0:
9137; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
9138; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
9139; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
9140; SKX-NEXT:    retq # sched: [7:1.00]
9141  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
9142  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9143  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9144  ret <16 x float> %out
9145}
9146
; Zero-masked register shuffle of <16 x float>: %vec1[12-15], %vec1[0-3],
; %vec2[4-7], %vec2[12-15]. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes are zeroed.
9147define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
9148; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
9149; GENERIC:       # %bb.0:
9150; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9151; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
9152; GENERIC-NEXT:    retq # sched: [1:1.00]
9153;
9154; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
9155; SKX:       # %bb.0:
9156; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9157; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
9158; SKX-NEXT:    retq # sched: [7:1.00]
9159  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
9160  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9161  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9162  ret <16 x float> %out
9163}
; Merge-masked register shuffle of <16 x float>: %vec1[0-3], %vec1[8-11],
; %vec2[0-3], %vec2[12-15]. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes keep %vec3.
9164define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
9165; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
9166; GENERIC:       # %bb.0:
9167; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
9168; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
9169; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
9170; GENERIC-NEXT:    retq # sched: [1:1.00]
9171;
9172; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
9173; SKX:       # %bb.0:
9174; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
9175; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
9176; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
9177; SKX-NEXT:    retq # sched: [7:1.00]
9178  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
9179  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9180  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9181  ret <16 x float> %out
9182}
9183
; Zero-masked register shuffle of <16 x float>: %vec1[0-3], %vec1[8-11],
; %vec2[0-3], %vec2[12-15]. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes are zeroed.
9184define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
9185; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
9186; GENERIC:       # %bb.0:
9187; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9188; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
9189; GENERIC-NEXT:    retq # sched: [1:1.00]
9190;
9191; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
9192; SKX:       # %bb.0:
9193; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9194; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
9195; SKX-NEXT:    retq # sched: [7:1.00]
9196  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
9197  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9198  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9199  ret <16 x float> %out
9200}
; Merge-masked register shuffle of <16 x float>: %vec1[12-15], %vec1[4-7],
; %vec2[0-3], %vec2[4-7]. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes keep %vec3.
9201define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
9202; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
9203; GENERIC:       # %bb.0:
9204; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
9205; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
9206; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
9207; GENERIC-NEXT:    retq # sched: [1:1.00]
9208;
9209; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
9210; SKX:       # %bb.0:
9211; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
9212; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
9213; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
9214; SKX-NEXT:    retq # sched: [7:1.00]
9215  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
9216  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9217  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9218  ret <16 x float> %out
9219}
9220
; Zero-masked register shuffle of <16 x float>: %vec1[12-15], %vec1[4-7],
; %vec2[0-3], %vec2[4-7]. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes are zeroed.
9221define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
9222; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
9223; GENERIC:       # %bb.0:
9224; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9225; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
9226; GENERIC-NEXT:    retq # sched: [1:1.00]
9227;
9228; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
9229; SKX:       # %bb.0:
9230; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9231; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
9232; SKX-NEXT:    retq # sched: [7:1.00]
9233  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
9234  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9235  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9236  ret <16 x float> %out
9237}
; Unmasked register shuffle of <16 x float>: returns %vec1[8-15] followed by
; %vec2[0-3] and %vec2[8-11].
9238define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
9239; GENERIC-LABEL: test_16xfloat_shuff_mask3:
9240; GENERIC:       # %bb.0:
9241; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
9242; GENERIC-NEXT:    retq # sched: [1:1.00]
9243;
9244; SKX-LABEL: test_16xfloat_shuff_mask3:
9245; SKX:       # %bb.0:
9246; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
9247; SKX-NEXT:    retq # sched: [7:1.00]
9248  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
9249  ret <16 x float> %perm
9250}
; Merge-masked register shuffle of <16 x float>: %vec1[8-15], %vec2[0-3],
; %vec2[8-11]. Lanes whose %mask element is zero take the shuffled value; the
; remaining lanes keep %vec3.
9251define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
9252; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
9253; GENERIC:       # %bb.0:
9254; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
9255; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
9256; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
9257; GENERIC-NEXT:    retq # sched: [1:1.00]
9258;
9259; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
9260; SKX:       # %bb.0:
9261; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
9262; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
9263; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
9264; SKX-NEXT:    retq # sched: [7:1.00]
9265  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
9266  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9267  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9268  ret <16 x float> %out
9269}
9270
; Zero-masked register shuffle of <16 x float>: %vec1[8-15], %vec2[0-3],
; %vec2[8-11]. Lanes whose %mask element is zero take the shuffled value; the
; remaining lanes are zeroed.
9271define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
9272; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
9273; GENERIC:       # %bb.0:
9274; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9275; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
9276; GENERIC-NEXT:    retq # sched: [1:1.00]
9277;
9278; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
9279; SKX:       # %bb.0:
9280; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9281; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
9282; SKX-NEXT:    retq # sched: [7:1.00]
9283  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
9284  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9285  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9286  ret <16 x float> %out
9287}
; Unmasked <16 x float> shuffle with a memory operand: returns %vec1[12-15],
; %vec1[8-11], then elements 8-11 and 4-7 of the vector loaded from %vec2p.
9288define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
9289; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0:
9290; GENERIC:       # %bb.0:
9291; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
9292; GENERIC-NEXT:    retq # sched: [1:1.00]
9293;
9294; SKX-LABEL: test_16xfloat_shuff_mem_mask0:
9295; SKX:       # %bb.0:
9296; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
9297; SKX-NEXT:    retq # sched: [7:1.00]
9298  %src2 = load <16 x float>, <16 x float>* %vec2p
9299  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
9300  ret <16 x float> %perm
9301}
; Merge-masked <16 x float> shuffle with a memory operand: %vec1[12-15],
; %vec1[8-11], then elements 8-11 and 4-7 of the vector loaded from %vec2p.
; Lanes whose %mask element is zero take the shuffled value; others keep %vec3.
9302define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
9303; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
9304; GENERIC:       # %bb.0:
9305; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9306; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
9307; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
9308; GENERIC-NEXT:    retq # sched: [1:1.00]
9309;
9310; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
9311; SKX:       # %bb.0:
9312; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9313; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
9314; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
9315; SKX-NEXT:    retq # sched: [7:1.00]
9316  %src2 = load <16 x float>, <16 x float>* %vec2p
9317  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
9318  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9319  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9320  ret <16 x float> %out
9321}
9322
; Zero-masked <16 x float> shuffle with a memory operand: %vec1[12-15],
; %vec1[8-11], then elements 8-11 and 4-7 of the vector loaded from %vec2p.
; Lanes whose %mask element is zero take the shuffled value; others are zeroed.
9323define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
9324; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
9325; GENERIC:       # %bb.0:
9326; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
9327; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
9328; GENERIC-NEXT:    retq # sched: [1:1.00]
9329;
9330; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
9331; SKX:       # %bb.0:
9332; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
9333; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
9334; SKX-NEXT:    retq # sched: [7:1.00]
9335  %src2 = load <16 x float>, <16 x float>* %vec2p
9336  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
9337  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9338  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9339  ret <16 x float> %out
9340}
9341
; Merge-masked <16 x float> shuffle with a memory operand: %vec1[8-11],
; %vec1[4-7], then elements 8-11 and 4-7 of the vector loaded from %vec2p.
; Lanes whose %mask element is zero take the shuffled value; others keep %vec3.
9342define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
9343; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
9344; GENERIC:       # %bb.0:
9345; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9346; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
9347; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
9348; GENERIC-NEXT:    retq # sched: [1:1.00]
9349;
9350; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
9351; SKX:       # %bb.0:
9352; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9353; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
9354; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
9355; SKX-NEXT:    retq # sched: [7:1.00]
9356  %src2 = load <16 x float>, <16 x float>* %vec2p
9357  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
9358  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9359  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9360  ret <16 x float> %out
9361}
9362
; Zero-masked <16 x float> shuffle with a memory operand: %vec1[8-11],
; %vec1[4-7], then elements 8-11 and 4-7 of the vector loaded from %vec2p.
; Lanes whose %mask element is zero take the shuffled value; others are zeroed.
9363define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
9364; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
9365; GENERIC:       # %bb.0:
9366; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
9367; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
9368; GENERIC-NEXT:    retq # sched: [1:1.00]
9369;
9370; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
9371; SKX:       # %bb.0:
9372; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
9373; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
9374; SKX-NEXT:    retq # sched: [7:1.00]
9375  %src2 = load <16 x float>, <16 x float>* %vec2p
9376  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
9377  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9378  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9379  ret <16 x float> %out
9380}
9381
; Merge-masked <16 x float> shuffle with a memory operand: %vec1[0-3] twice,
; then elements 8-11 of the vector loaded from %vec2p twice. Lanes whose %mask
; element is zero take the shuffled value; others keep %vec3.
9382define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
9383; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
9384; GENERIC:       # %bb.0:
9385; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9386; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
9387; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
9388; GENERIC-NEXT:    retq # sched: [1:1.00]
9389;
9390; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
9391; SKX:       # %bb.0:
9392; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9393; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
9394; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
9395; SKX-NEXT:    retq # sched: [7:1.00]
9396  %src2 = load <16 x float>, <16 x float>* %vec2p
9397  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
9398  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9399  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9400  ret <16 x float> %out
9401}
9402
; Zero-masked <16 x float> shuffle with a memory operand: %vec1[0-3] twice,
; then elements 8-11 of the vector loaded from %vec2p twice. Lanes whose %mask
; element is zero take the shuffled value; others are zeroed.
9403define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
9404; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
9405; GENERIC:       # %bb.0:
9406; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
9407; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
9408; GENERIC-NEXT:    retq # sched: [1:1.00]
9409;
9410; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
9411; SKX:       # %bb.0:
9412; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
9413; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
9414; SKX-NEXT:    retq # sched: [7:1.00]
9415  %src2 = load <16 x float>, <16 x float>* %vec2p
9416  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
9417  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9418  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9419  ret <16 x float> %out
9420}
9421
; Unmasked <16 x float> shuffle with a memory operand: returns %vec1[4-7],
; %vec1[0-3], then elements 12-15 of the vector loaded from %vec2p twice.
9422define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
9423; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3:
9424; GENERIC:       # %bb.0:
9425; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
9426; GENERIC-NEXT:    retq # sched: [1:1.00]
9427;
9428; SKX-LABEL: test_16xfloat_shuff_mem_mask3:
9429; SKX:       # %bb.0:
9430; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
9431; SKX-NEXT:    retq # sched: [7:1.00]
9432  %src2 = load <16 x float>, <16 x float>* %vec2p
9433  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
9434  ret <16 x float> %perm
9435}
; Merge-masked <16 x float> shuffle with a memory operand: %vec1[4-7],
; %vec1[0-3], then elements 12-15 of the vector loaded from %vec2p twice.
; Lanes whose %mask element is zero take the shuffled value; others keep %vec3.
9436define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
9437; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
9438; GENERIC:       # %bb.0:
9439; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
9440; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
9441; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
9442; GENERIC-NEXT:    retq # sched: [1:1.00]
9443;
9444; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
9445; SKX:       # %bb.0:
9446; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
9447; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
9448; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
9449; SKX-NEXT:    retq # sched: [7:1.00]
9450  %src2 = load <16 x float>, <16 x float>* %vec2p
9451  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
9452  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9453  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> %vec3
9454  ret <16 x float> %out
9455}
9456
; Zero-masked <16 x float> shuffle with a memory operand: %vec1[4-7],
; %vec1[0-3], then elements 12-15 of the vector loaded from %vec2p twice.
; Lanes whose %mask element is zero take the shuffled value; others are zeroed.
9457define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
9458; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
9459; GENERIC:       # %bb.0:
9460; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
9461; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
9462; GENERIC-NEXT:    retq # sched: [1:1.00]
9463;
9464; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
9465; SKX:       # %bb.0:
9466; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
9467; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
9468; SKX-NEXT:    retq # sched: [7:1.00]
9469  %src2 = load <16 x float>, <16 x float>* %vec2p
9470  %perm = shufflevector <16 x float> %vec1, <16 x float> %src2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
9471  %iszero = icmp eq <16 x i32> %mask, zeroinitializer
9472  %out = select <16 x i1> %iszero, <16 x float> %perm, <16 x float> zeroinitializer
9473  ret <16 x float> %out
9474}
9475
; Unmasked register shuffle of <4 x double>: returns elements 2-3 of %vec1
; followed by elements 0-1 of %vec2.
9476define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
9477; GENERIC-LABEL: test_4xdouble_shuff_mask0:
9478; GENERIC:       # %bb.0:
9479; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
9480; GENERIC-NEXT:    retq # sched: [1:1.00]
9481;
9482; SKX-LABEL: test_4xdouble_shuff_mask0:
9483; SKX:       # %bb.0:
9484; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
9485; SKX-NEXT:    retq # sched: [7:1.00]
9486  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9487  ret <4 x double> %perm
9488}
; Merge-masked register shuffle of <4 x double>: elements 2-3 of %vec1 followed
; by elements 0-1 of %vec2. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes keep %vec3.
9489define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
9490; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
9491; GENERIC:       # %bb.0:
9492; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
9493; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
9494; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
9495; GENERIC-NEXT:    retq # sched: [1:1.00]
9496;
9497; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
9498; SKX:       # %bb.0:
9499; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
9500; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
9501; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
9502; SKX-NEXT:    retq # sched: [7:1.00]
9503  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9504  %iszero = icmp eq <4 x i64> %mask, zeroinitializer
9505  %out = select <4 x i1> %iszero, <4 x double> %perm, <4 x double> %vec3
9506  ret <4 x double> %out
9507}
9508
; Zero-masked register shuffle of <4 x double>: elements 2-3 of %vec1 followed
; by elements 0-1 of %vec2. Lanes whose %mask element is zero take the shuffled
; value; the remaining lanes are zeroed.
9509define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
9510; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
9511; GENERIC:       # %bb.0:
9512; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9513; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
9514; GENERIC-NEXT:    retq # sched: [1:1.00]
9515;
9516; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
9517; SKX:       # %bb.0:
9518; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9519; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
9520; SKX-NEXT:    retq # sched: [7:1.00]
9521  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9522  %iszero = icmp eq <4 x i64> %mask, zeroinitializer
9523  %out = select <4 x i1> %iszero, <4 x double> %perm, <4 x double> zeroinitializer
9524  ret <4 x double> %out
9525}
; Merge-masked register shuffle of <4 x double>: elements 2-3 of %vec1 followed
; by elements 0-1 of %vec2 (the same index list as the mask0 variant). Lanes
; whose %mask element is zero take the shuffled value; the rest keep %vec3.
9526define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
9527; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
9528; GENERIC:       # %bb.0:
9529; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
9530; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
9531; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
9532; GENERIC-NEXT:    retq # sched: [1:1.00]
9533;
9534; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
9535; SKX:       # %bb.0:
9536; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
9537; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
9538; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
9539; SKX-NEXT:    retq # sched: [7:1.00]
9540  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9541  %iszero = icmp eq <4 x i64> %mask, zeroinitializer
9542  %out = select <4 x i1> %iszero, <4 x double> %perm, <4 x double> %vec3
9543  ret <4 x double> %out
9544}
9545
; Zero-masked register shuffle of <4 x double>: elements 2-3 of %vec1 followed
; by elements 0-1 of %vec2 (the same index list as the mask0 variant). Lanes
; whose %mask element is zero take the shuffled value; the rest are zeroed.
9546define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
9547; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
9548; GENERIC:       # %bb.0:
9549; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9550; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
9551; GENERIC-NEXT:    retq # sched: [1:1.00]
9552;
9553; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
9554; SKX:       # %bb.0:
9555; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9556; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
9557; SKX-NEXT:    retq # sched: [7:1.00]
9558  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9559  %iszero = icmp eq <4 x i64> %mask, zeroinitializer
9560  %out = select <4 x i1> %iszero, <4 x double> %perm, <4 x double> zeroinitializer
9561  ret <4 x double> %out
9562}
; Merge-masked 256-bit shuffle: elements 2-3 of %vec1 followed by elements 2-3
; of %vec2. Lanes whose %mask element equals zero take the shuffled value; all
; other lanes keep the corresponding element of %vec3.
9563define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
9564; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
9565; GENERIC:       # %bb.0:
9566; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
9567; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
9568; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
9569; GENERIC-NEXT:    retq # sched: [1:1.00]
9570;
9571; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
9572; SKX:       # %bb.0:
9573; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
9574; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
9575; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
9576; SKX-NEXT:    retq # sched: [7:1.00]
9577  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9578  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9579  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
9580  ret <4 x double> %res
9581}
9582
; Zero-masked 256-bit shuffle: elements 2-3 of %vec1 followed by elements 2-3
; of %vec2. Lanes whose %mask element equals zero take the shuffled value; all
; other lanes are zero. The check lines expect the {z} form of vshuff64x2.
9583define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
9584; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
9585; GENERIC:       # %bb.0:
9586; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9587; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
9588; GENERIC-NEXT:    retq # sched: [1:1.00]
9589;
9590; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
9591; SKX:       # %bb.0:
9592; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9593; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
9594; SKX-NEXT:    retq # sched: [7:1.00]
9595  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9596  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9597  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
9598  ret <4 x double> %res
9599}
; Unmasked 256-bit shuffle: elements 2-3 of %vec1 followed by elements 2-3 of
; %vec2. With no mask involved the check lines expect vperm2f128 rather than
; vshuff64x2.
9600define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
9601; GENERIC-LABEL: test_4xdouble_shuff_mask3:
9602; GENERIC:       # %bb.0:
9603; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
9604; GENERIC-NEXT:    retq # sched: [1:1.00]
9605;
9606; SKX-LABEL: test_4xdouble_shuff_mask3:
9607; SKX:       # %bb.0:
9608; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
9609; SKX-NEXT:    retq # sched: [7:1.00]
9610  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9611  ret <4 x double> %res
9612}
; Merge-masked 256-bit shuffle: elements 2-3 of %vec1 followed by elements 2-3
; of %vec2. Lanes whose %mask element equals zero take the shuffled value; all
; other lanes keep the corresponding element of %vec3.
; NOTE(review): the shuffle indices are identical to those in
; test_4xdouble_masked_shuff_mask2, so this case adds no new shuffle coverage.
9613define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
9614; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
9615; GENERIC:       # %bb.0:
9616; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
9617; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
9618; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
9619; GENERIC-NEXT:    retq # sched: [1:1.00]
9620;
9621; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
9622; SKX:       # %bb.0:
9623; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
9624; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
9625; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
9626; SKX-NEXT:    retq # sched: [7:1.00]
9627  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9628  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9629  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
9630  ret <4 x double> %res
9631}
9632
; Zero-masked 256-bit shuffle: elements 2-3 of %vec1 followed by elements 2-3
; of %vec2. Lanes whose %mask element equals zero take the shuffled value; all
; other lanes are zero.
; NOTE(review): the shuffle indices are identical to those in
; test_4xdouble_zero_masked_shuff_mask2, so this case adds no new shuffle
; coverage.
9633define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
9634; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
9635; GENERIC:       # %bb.0:
9636; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9637; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
9638; GENERIC-NEXT:    retq # sched: [1:1.00]
9639;
9640; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
9641; SKX:       # %bb.0:
9642; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9643; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
9644; SKX-NEXT:    retq # sched: [7:1.00]
9645  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9646  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9647  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
9648  ret <4 x double> %res
9649}
; Unmasked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 2-3 of the loaded vector. The
; check lines expect the load to be folded into vperm2f128 as a memory operand.
9650define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
9651; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0:
9652; GENERIC:       # %bb.0:
9653; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
9654; GENERIC-NEXT:    retq # sched: [1:1.00]
9655;
9656; SKX-LABEL: test_4xdouble_shuff_mem_mask0:
9657; SKX:       # %bb.0:
9658; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
9659; SKX-NEXT:    retq # sched: [7:1.00]
9660  %vec2 = load <4 x double>, <4 x double>* %vec2p
9661  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9662  ret <4 x double> %res
9663}
; Merge-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 2-3 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes keep
; the corresponding element of %vec3. The check lines expect the load to be
; folded into vshuff64x2 as a memory operand.
9664define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
9665; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
9666; GENERIC:       # %bb.0:
9667; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9668; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
9669; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
9670; GENERIC-NEXT:    retq # sched: [1:1.00]
9671;
9672; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
9673; SKX:       # %bb.0:
9674; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9675; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
9676; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
9677; SKX-NEXT:    retq # sched: [7:1.00]
9678  %vec2 = load <4 x double>, <4 x double>* %vec2p
9679  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9680  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9681  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
9682  ret <4 x double> %res
9683}
9684
; Zero-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 2-3 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes are
; zero. The check lines expect the load to be folded into the {z} vshuff64x2.
9685define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
9686; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
9687; GENERIC:       # %bb.0:
9688; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
9689; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
9690; GENERIC-NEXT:    retq # sched: [1:1.00]
9691;
9692; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
9693; SKX:       # %bb.0:
9694; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
9695; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
9696; SKX-NEXT:    retq # sched: [7:1.00]
9697  %vec2 = load <4 x double>, <4 x double>* %vec2p
9698  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9699  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9700  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
9701  ret <4 x double> %res
9702}
9703
; Merge-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 0-1 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes keep
; the corresponding element of %vec3. The check lines expect the load to be
; folded into vshuff64x2 as a memory operand.
9704define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
9705; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
9706; GENERIC:       # %bb.0:
9707; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9708; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
9709; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
9710; GENERIC-NEXT:    retq # sched: [1:1.00]
9711;
9712; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
9713; SKX:       # %bb.0:
9714; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9715; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
9716; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
9717; SKX-NEXT:    retq # sched: [7:1.00]
9718  %vec2 = load <4 x double>, <4 x double>* %vec2p
9719  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9720  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9721  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
9722  ret <4 x double> %res
9723}
9724
; Zero-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 0-1 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes are
; zero. The check lines expect the load to be folded into the {z} vshuff64x2.
9725define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
9726; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
9727; GENERIC:       # %bb.0:
9728; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
9729; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
9730; GENERIC-NEXT:    retq # sched: [1:1.00]
9731;
9732; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
9733; SKX:       # %bb.0:
9734; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
9735; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
9736; SKX-NEXT:    retq # sched: [7:1.00]
9737  %vec2 = load <4 x double>, <4 x double>* %vec2p
9738  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9739  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9740  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
9741  ret <4 x double> %res
9742}
9743
; Merge-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 0-1 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes keep
; the corresponding element of %vec3.
; NOTE(review): the shuffle indices are identical to those in
; test_4xdouble_masked_shuff_mem_mask1, so this case adds no new shuffle
; coverage.
9744define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
9745; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
9746; GENERIC:       # %bb.0:
9747; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9748; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
9749; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
9750; GENERIC-NEXT:    retq # sched: [1:1.00]
9751;
9752; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
9753; SKX:       # %bb.0:
9754; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9755; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
9756; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
9757; SKX-NEXT:    retq # sched: [7:1.00]
9758  %vec2 = load <4 x double>, <4 x double>* %vec2p
9759  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9760  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9761  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
9762  ret <4 x double> %res
9763}
9764
; Zero-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 0-1 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes are
; zero.
; NOTE(review): the shuffle indices are identical to those in
; test_4xdouble_zero_masked_shuff_mem_mask1, so this case adds no new shuffle
; coverage.
9765define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
9766; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
9767; GENERIC:       # %bb.0:
9768; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
9769; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
9770; GENERIC-NEXT:    retq # sched: [1:1.00]
9771;
9772; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
9773; SKX:       # %bb.0:
9774; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
9775; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
9776; SKX-NEXT:    retq # sched: [7:1.00]
9777  %vec2 = load <4 x double>, <4 x double>* %vec2p
9778  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
9779  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9780  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
9781  ret <4 x double> %res
9782}
9783
; Unmasked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 2-3 of the loaded vector. The
; check lines expect the load to be folded into vperm2f128 as a memory operand.
; NOTE(review): the IR body is identical to test_4xdouble_shuff_mem_mask0, so
; this case adds no new shuffle coverage.
9784define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
9785; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3:
9786; GENERIC:       # %bb.0:
9787; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
9788; GENERIC-NEXT:    retq # sched: [1:1.00]
9789;
9790; SKX-LABEL: test_4xdouble_shuff_mem_mask3:
9791; SKX:       # %bb.0:
9792; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
9793; SKX-NEXT:    retq # sched: [7:1.00]
9794  %vec2 = load <4 x double>, <4 x double>* %vec2p
9795  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9796  ret <4 x double> %res
9797}
; Merge-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 2-3 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes keep
; the corresponding element of %vec3.
; NOTE(review): the shuffle indices are identical to those in
; test_4xdouble_masked_shuff_mem_mask0, so this case adds no new shuffle
; coverage.
9798define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
9799; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
9800; GENERIC:       # %bb.0:
9801; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
9802; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
9803; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
9804; GENERIC-NEXT:    retq # sched: [1:1.00]
9805;
9806; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
9807; SKX:       # %bb.0:
9808; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
9809; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
9810; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
9811; SKX-NEXT:    retq # sched: [7:1.00]
9812  %vec2 = load <4 x double>, <4 x double>* %vec2p
9813  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9814  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9815  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
9816  ret <4 x double> %res
9817}
9818
; Zero-masked 256-bit shuffle with the second operand loaded from %vec2p:
; elements 2-3 of %vec1 followed by elements 2-3 of the loaded vector. Lanes
; whose %mask element equals zero take the shuffled value; all other lanes are
; zero.
; NOTE(review): the shuffle indices are identical to those in
; test_4xdouble_zero_masked_shuff_mem_mask0, so this case adds no new shuffle
; coverage.
9819define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
9820; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
9821; GENERIC:       # %bb.0:
9822; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
9823; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
9824; GENERIC-NEXT:    retq # sched: [1:1.00]
9825;
9826; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
9827; SKX:       # %bb.0:
9828; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
9829; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
9830; SKX-NEXT:    retq # sched: [7:1.00]
9831  %vec2 = load <4 x double>, <4 x double>* %vec2p
9832  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
9833  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
9834  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
9835  ret <4 x double> %res
9836}
9837
; Unmasked 512-bit shuffle: element pairs 6-7 and 2-3 of %vec1 followed by
; element pairs 6-7 and 0-1 of %vec2. The check lines expect a single
; vshuff64x2 on zmm registers.
9838define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
9839; GENERIC-LABEL: test_8xdouble_shuff_mask0:
9840; GENERIC:       # %bb.0:
9841; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
9842; GENERIC-NEXT:    retq # sched: [1:1.00]
9843;
9844; SKX-LABEL: test_8xdouble_shuff_mask0:
9845; SKX:       # %bb.0:
9846; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
9847; SKX-NEXT:    retq # sched: [7:1.00]
9848  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
9849  ret <8 x double> %res
9850}
; Merge-masked 512-bit shuffle: element pairs 6-7 and 2-3 of %vec1 followed by
; element pairs 6-7 and 0-1 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes keep the corresponding element of
; %vec3.
9851define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
9852; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
9853; GENERIC:       # %bb.0:
9854; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
9855; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
9856; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
9857; GENERIC-NEXT:    retq # sched: [1:1.00]
9858;
9859; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
9860; SKX:       # %bb.0:
9861; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
9862; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
9863; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
9864; SKX-NEXT:    retq # sched: [7:1.00]
9865  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
9866  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
9867  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
9868  ret <8 x double> %res
9869}
9870
; Zero-masked 512-bit shuffle: element pairs 6-7 and 2-3 of %vec1 followed by
; element pairs 6-7 and 0-1 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes are zero.
9871define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
9872; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
9873; GENERIC:       # %bb.0:
9874; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
9875; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
9876; GENERIC-NEXT:    retq # sched: [1:1.00]
9877;
9878; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
9879; SKX:       # %bb.0:
9880; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
9881; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
9882; SKX-NEXT:    retq # sched: [7:1.00]
9883  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
9884  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
9885  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
9886  ret <8 x double> %res
9887}
; Merge-masked 512-bit shuffle: element pairs 0-1 and 4-5 of %vec1 followed by
; element pairs 0-1 and 4-5 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes keep the corresponding element of
; %vec3.
9888define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
9889; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
9890; GENERIC:       # %bb.0:
9891; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
9892; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
9893; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
9894; GENERIC-NEXT:    retq # sched: [1:1.00]
9895;
9896; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
9897; SKX:       # %bb.0:
9898; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
9899; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
9900; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
9901; SKX-NEXT:    retq # sched: [7:1.00]
9902  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
9903  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
9904  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
9905  ret <8 x double> %res
9906}
9907
; Zero-masked 512-bit shuffle: element pairs 0-1 and 4-5 of %vec1 followed by
; element pairs 0-1 and 4-5 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes are zero.
9908define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
9909; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
9910; GENERIC:       # %bb.0:
9911; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
9912; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
9913; GENERIC-NEXT:    retq # sched: [1:1.00]
9914;
9915; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
9916; SKX:       # %bb.0:
9917; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
9918; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
9919; SKX-NEXT:    retq # sched: [7:1.00]
9920  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
9921  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
9922  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
9923  ret <8 x double> %res
9924}
; Merge-masked 512-bit shuffle: element pairs 6-7 and 4-5 of %vec1 followed by
; element pairs 4-5 and 0-1 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes keep the corresponding element of
; %vec3.
9925define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
9926; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
9927; GENERIC:       # %bb.0:
9928; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
9929; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
9930; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
9931; GENERIC-NEXT:    retq # sched: [1:1.00]
9932;
9933; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
9934; SKX:       # %bb.0:
9935; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
9936; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
9937; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
9938; SKX-NEXT:    retq # sched: [7:1.00]
9939  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
9940  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
9941  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
9942  ret <8 x double> %res
9943}
9944
; Zero-masked 512-bit shuffle: element pairs 6-7 and 4-5 of %vec1 followed by
; element pairs 4-5 and 0-1 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes are zero.
9945define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
9946; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
9947; GENERIC:       # %bb.0:
9948; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
9949; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
9950; GENERIC-NEXT:    retq # sched: [1:1.00]
9951;
9952; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
9953; SKX:       # %bb.0:
9954; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
9955; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
9956; SKX-NEXT:    retq # sched: [7:1.00]
9957  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
9958  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
9959  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
9960  ret <8 x double> %res
9961}
; Unmasked 512-bit shuffle: element pair 4-5 of %vec1 taken twice, followed by
; element pairs 4-5 and 2-3 of %vec2. The check lines expect a single
; vshuff64x2 on zmm registers.
9962define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
9963; GENERIC-LABEL: test_8xdouble_shuff_mask3:
9964; GENERIC:       # %bb.0:
9965; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
9966; GENERIC-NEXT:    retq # sched: [1:1.00]
9967;
9968; SKX-LABEL: test_8xdouble_shuff_mask3:
9969; SKX:       # %bb.0:
9970; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
9971; SKX-NEXT:    retq # sched: [7:1.00]
9972  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
9973  ret <8 x double> %res
9974}
; Merge-masked 512-bit shuffle: element pair 4-5 of %vec1 taken twice, followed
; by element pairs 4-5 and 2-3 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes keep the corresponding element of
; %vec3.
9975define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
9976; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
9977; GENERIC:       # %bb.0:
9978; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
9979; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
9980; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
9981; GENERIC-NEXT:    retq # sched: [1:1.00]
9982;
9983; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
9984; SKX:       # %bb.0:
9985; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
9986; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
9987; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
9988; SKX-NEXT:    retq # sched: [7:1.00]
9989  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
9990  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
9991  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
9992  ret <8 x double> %res
9993}
9994
; Zero-masked 512-bit shuffle: element pair 4-5 of %vec1 taken twice, followed
; by element pairs 4-5 and 2-3 of %vec2. Lanes whose %mask element equals zero
; take the shuffled value; all other lanes are zero.
9995define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
9996; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
9997; GENERIC:       # %bb.0:
9998; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
9999; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
10000; GENERIC-NEXT:    retq # sched: [1:1.00]
10001;
10002; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
10003; SKX:       # %bb.0:
10004; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
10005; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
10006; SKX-NEXT:    retq # sched: [7:1.00]
10007  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
10008  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10009  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
10010  ret <8 x double> %res
10011}
; Unmasked 512-bit shuffle with the second operand loaded from %vec2p:
; element pairs 6-7 and 0-1 of %vec1 followed by element pair 0-1 of the loaded
; vector taken twice. The check lines expect the load to be folded into
; vshuff64x2 as a memory operand.
10012define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
10013; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0:
10014; GENERIC:       # %bb.0:
10015; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
10016; GENERIC-NEXT:    retq # sched: [1:1.00]
10017;
10018; SKX-LABEL: test_8xdouble_shuff_mem_mask0:
10019; SKX:       # %bb.0:
10020; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
10021; SKX-NEXT:    retq # sched: [7:1.00]
10022  %vec2 = load <8 x double>, <8 x double>* %vec2p
10023  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
10024  ret <8 x double> %res
10025}
; Merge-masked 512-bit shuffle with the second operand loaded from %vec2p:
; element pairs 6-7 and 0-1 of %vec1 followed by element pair 0-1 of the loaded
; vector taken twice. Lanes whose %mask element equals zero take the shuffled
; value; all other lanes keep the corresponding element of %vec3.
10026define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
10027; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
10028; GENERIC:       # %bb.0:
10029; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
10030; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
10031; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
10032; GENERIC-NEXT:    retq # sched: [1:1.00]
10033;
10034; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
10035; SKX:       # %bb.0:
10036; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
10037; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
10038; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
10039; SKX-NEXT:    retq # sched: [7:1.00]
10040  %vec2 = load <8 x double>, <8 x double>* %vec2p
10041  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
10042  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10043  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
10044  ret <8 x double> %res
10045}
10046
; Zero-masked 512-bit shuffle with the second operand loaded from %vec2p:
; element pairs 6-7 and 0-1 of %vec1 followed by element pair 0-1 of the loaded
; vector taken twice. Lanes whose %mask element equals zero take the shuffled
; value; all other lanes are zero.
10047define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
10048; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
10049; GENERIC:       # %bb.0:
10050; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
10051; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
10052; GENERIC-NEXT:    retq # sched: [1:1.00]
10053;
10054; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
10055; SKX:       # %bb.0:
10056; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
10057; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
10058; SKX-NEXT:    retq # sched: [7:1.00]
10059  %vec2 = load <8 x double>, <8 x double>* %vec2p
10060  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
10061  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10062  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
10063  ret <8 x double> %res
10064}
10065
; Merge-masked 512-bit shuffle with the second operand loaded from %vec2p:
; element pair 6-7 of %vec1 taken twice, followed by elements 0-3 of the loaded
; vector. Lanes whose %mask element equals zero take the shuffled value; all
; other lanes keep the corresponding element of %vec3.
10066define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
10067; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
10068; GENERIC:       # %bb.0:
10069; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
10070; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
10071; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
10072; GENERIC-NEXT:    retq # sched: [1:1.00]
10073;
10074; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
10075; SKX:       # %bb.0:
10076; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
10077; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
10078; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
10079; SKX-NEXT:    retq # sched: [7:1.00]
10080  %vec2 = load <8 x double>, <8 x double>* %vec2p
10081  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10082  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10083  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
10084  ret <8 x double> %res
10085}
10086
; Zero-masked 512-bit shuffle with the second operand loaded from %vec2p:
; element pair 6-7 of %vec1 taken twice, followed by elements 0-3 of the loaded
; vector. Lanes whose %mask element equals zero take the shuffled value; all
; other lanes are zero.
10087define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
10088; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
10089; GENERIC:       # %bb.0:
10090; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
10091; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
10092; GENERIC-NEXT:    retq # sched: [1:1.00]
10093;
10094; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
10095; SKX:       # %bb.0:
10096; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
10097; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
10098; SKX-NEXT:    retq # sched: [7:1.00]
10099  %vec2 = load <8 x double>, <8 x double>* %vec2p
10100  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10101  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10102  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
10103  ret <8 x double> %res
10104}
10105
; Merge-masked 512-bit shuffle with the second operand loaded from %vec2p:
; elements 0-3 of %vec1 followed by element pairs 0-1 and 4-5 of the loaded
; vector. Lanes whose %mask element equals zero take the shuffled value; all
; other lanes keep the corresponding element of %vec3.
10106define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
10107; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
10108; GENERIC:       # %bb.0:
10109; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
10110; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
10111; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
10112; GENERIC-NEXT:    retq # sched: [1:1.00]
10113;
10114; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
10115; SKX:       # %bb.0:
10116; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
10117; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
10118; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
10119; SKX-NEXT:    retq # sched: [7:1.00]
10120  %vec2 = load <8 x double>, <8 x double>* %vec2p
10121  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
10122  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10123  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
10124  ret <8 x double> %res
10125}
10126
; Zero-masked shuffle of %vec1 with the <8 x double> loaded from %vec2p:
; elements whose %mask entry is zero take the shuffle result, all others are
; zero. Checks expect vptestnmq followed by vshuff64x2 {%k1} {z} with a memory
; operand, written directly to %zmm0.
10127define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
10128; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
10129; GENERIC:       # %bb.0:
10130; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
10131; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
10132; GENERIC-NEXT:    retq # sched: [1:1.00]
10133;
10134; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
10135; SKX:       # %bb.0:
10136; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
10137; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
10138; SKX-NEXT:    retq # sched: [7:1.00]
10139  %vec2 = load <8 x double>, <8 x double>* %vec2p
10140  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
10141  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10142  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
10143  ret <8 x double> %res
10144}
10145
; Unmasked shuffle of %vec1 with the <8 x double> loaded from %vec2p, using
; indices <2,3,0,1,12,13,8,9>. Both check prefixes expect a single vshuff64x2
; with a memory operand.
10146define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
10147; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3:
10148; GENERIC:       # %bb.0:
10149; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
10150; GENERIC-NEXT:    retq # sched: [1:1.00]
10151;
10152; SKX-LABEL: test_8xdouble_shuff_mem_mask3:
10153; SKX:       # %bb.0:
10154; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
10155; SKX-NEXT:    retq # sched: [7:1.00]
10156  %vec2 = load <8 x double>, <8 x double>* %vec2p
10157  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
10158  ret <8 x double> %res
10159}
; Merge-masked variant of the <2,3,0,1,12,13,8,9> shuffle with the second
; source loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, the others keep %vec3. Checks expect vptestnmq, a
; vshuff64x2 {%k1} into %zmm1, then a copy into %zmm0.
10160define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
10161; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
10162; GENERIC:       # %bb.0:
10163; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
10164; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
10165; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
10166; GENERIC-NEXT:    retq # sched: [1:1.00]
10167;
10168; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
10169; SKX:       # %bb.0:
10170; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
10171; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
10172; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
10173; SKX-NEXT:    retq # sched: [7:1.00]
10174  %vec2 = load <8 x double>, <8 x double>* %vec2p
10175  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
10176  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10177  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
10178  ret <8 x double> %res
10179}
10180
; Zero-masked variant of the <2,3,0,1,12,13,8,9> shuffle with the second
; source loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, all others are zero. Checks expect vptestnmq followed by
; vshuff64x2 {%k1} {z} with a memory operand.
10181define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
10182; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
10183; GENERIC:       # %bb.0:
10184; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
10185; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
10186; GENERIC-NEXT:    retq # sched: [1:1.00]
10187;
10188; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
10189; SKX:       # %bb.0:
10190; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
10191; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
10192; SKX-NEXT:    retq # sched: [7:1.00]
10193  %vec2 = load <8 x double>, <8 x double>* %vec2p
10194  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
10195  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
10196  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
10197  ret <8 x double> %res
10198}
10199
; Unmasked shuffle taking elements 4-7 of %vec1 followed by elements 4-7 of
; %vec2 (indices 12-15). Both check prefixes expect a single vperm2i128 for
; this unmasked case rather than vshufi32x4.
10200define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
10201; GENERIC-LABEL: test_8xi32_shuff_mask0:
10202; GENERIC:       # %bb.0:
10203; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
10204; GENERIC-NEXT:    retq # sched: [1:1.00]
10205;
10206; SKX-LABEL: test_8xi32_shuff_mask0:
10207; SKX:       # %bb.0:
10208; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
10209; SKX-NEXT:    retq # sched: [7:1.00]
10210  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10211  ret <8 x i32> %res
10212}
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 4-7 of %vec2:
; elements whose %mask entry is zero take the shuffle result, the others keep
; %vec3. Checks expect vptestnmd to build %k1, vshufi32x4 {%k1} into %ymm2,
; then a vmovdqa of %ymm2 into %ymm0.
10213define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
10214; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
10215; GENERIC:       # %bb.0:
10216; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
10217; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
10218; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
10219; GENERIC-NEXT:    retq # sched: [1:1.00]
10220;
10221; SKX-LABEL: test_8xi32_masked_shuff_mask0:
10222; SKX:       # %bb.0:
10223; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
10224; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
10225; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
10226; SKX-NEXT:    retq # sched: [7:1.00]
10227  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10228  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10229  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10230  ret <8 x i32> %res
10231}
10232
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 4-7 of %vec2:
; elements whose %mask entry is zero take the shuffle result, all others are
; zero. Checks expect vptestnmd followed by vshufi32x4 {%k1} {z} into %ymm0.
10233define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
10234; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
10235; GENERIC:       # %bb.0:
10236; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10237; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
10238; GENERIC-NEXT:    retq # sched: [1:1.00]
10239;
10240; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
10241; SKX:       # %bb.0:
10242; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10243; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
10244; SKX-NEXT:    retq # sched: [7:1.00]
10245  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10246  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10247  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10248  ret <8 x i32> %res
10249}
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of %vec2
; (indices 8-11): elements whose %mask entry is zero take the shuffle result,
; the others keep %vec3. Checks expect vptestnmd, vshufi32x4 {%k1} into %ymm2,
; then a vmovdqa of %ymm2 into %ymm0.
10250define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
10251; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
10252; GENERIC:       # %bb.0:
10253; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
10254; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
10255; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
10256; GENERIC-NEXT:    retq # sched: [1:1.00]
10257;
10258; SKX-LABEL: test_8xi32_masked_shuff_mask1:
10259; SKX:       # %bb.0:
10260; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
10261; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
10262; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
10263; SKX-NEXT:    retq # sched: [7:1.00]
10264  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10265  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10266  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10267  ret <8 x i32> %res
10268}
10269
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of %vec2
; (indices 8-11): elements whose %mask entry is zero take the shuffle result,
; all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} into %ymm0.
10270define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
10271; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
10272; GENERIC:       # %bb.0:
10273; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10274; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
10275; GENERIC-NEXT:    retq # sched: [1:1.00]
10276;
10277; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
10278; SKX:       # %bb.0:
10279; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10280; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
10281; SKX-NEXT:    retq # sched: [7:1.00]
10282  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10283  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10284  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10285  ret <8 x i32> %res
10286}
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 4-7 of %vec2:
; elements whose %mask entry is zero take the shuffle result, the others keep
; %vec3. Checks expect vptestnmd, vshufi32x4 {%k1} into %ymm2, then a vmovdqa
; into %ymm0.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_masked_shuff_mask0, so the expected code is the same.
10287define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
10288; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
10289; GENERIC:       # %bb.0:
10290; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
10291; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
10292; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
10293; GENERIC-NEXT:    retq # sched: [1:1.00]
10294;
10295; SKX-LABEL: test_8xi32_masked_shuff_mask2:
10296; SKX:       # %bb.0:
10297; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
10298; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
10299; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
10300; SKX-NEXT:    retq # sched: [7:1.00]
10301  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10302  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10303  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10304  ret <8 x i32> %res
10305}
10306
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 4-7 of %vec2:
; elements whose %mask entry is zero take the shuffle result, all others are
; zero. Checks expect vptestnmd followed by vshufi32x4 {%k1} {z} into %ymm0.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_zero_masked_shuff_mask0, so the expected code is the same.
10307define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
10308; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
10309; GENERIC:       # %bb.0:
10310; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10311; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
10312; GENERIC-NEXT:    retq # sched: [1:1.00]
10313;
10314; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
10315; SKX:       # %bb.0:
10316; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10317; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
10318; SKX-NEXT:    retq # sched: [7:1.00]
10319  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10320  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10321  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10322  ret <8 x i32> %res
10323}
; Unmasked shuffle taking elements 4-7 of %vec1 followed by elements 0-3 of
; %vec2 (indices 8-11). Both check prefixes expect a single vperm2i128 for
; this unmasked case rather than vshufi32x4.
10324define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
10325; GENERIC-LABEL: test_8xi32_shuff_mask3:
10326; GENERIC:       # %bb.0:
10327; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
10328; GENERIC-NEXT:    retq # sched: [1:1.00]
10329;
10330; SKX-LABEL: test_8xi32_shuff_mask3:
10331; SKX:       # %bb.0:
10332; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
10333; SKX-NEXT:    retq # sched: [7:1.00]
10334  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10335  ret <8 x i32> %res
10336}
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of %vec2:
; elements whose %mask entry is zero take the shuffle result, the others keep
; %vec3. Checks expect vptestnmd, vshufi32x4 {%k1} into %ymm2, then a vmovdqa
; into %ymm0.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_masked_shuff_mask1, so the expected code is the same.
10337define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
10338; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
10339; GENERIC:       # %bb.0:
10340; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
10341; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
10342; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
10343; GENERIC-NEXT:    retq # sched: [1:1.00]
10344;
10345; SKX-LABEL: test_8xi32_masked_shuff_mask3:
10346; SKX:       # %bb.0:
10347; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
10348; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
10349; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
10350; SKX-NEXT:    retq # sched: [7:1.00]
10351  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10352  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10353  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10354  ret <8 x i32> %res
10355}
10356
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of %vec2:
; elements whose %mask entry is zero take the shuffle result, all others are
; zero. Checks expect vptestnmd followed by vshufi32x4 {%k1} {z} into %ymm0.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_zero_masked_shuff_mask1, so the expected code is the same.
10357define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
10358; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
10359; GENERIC:       # %bb.0:
10360; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10361; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
10362; GENERIC-NEXT:    retq # sched: [1:1.00]
10363;
10364; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
10365; SKX:       # %bb.0:
10366; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10367; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
10368; SKX-NEXT:    retq # sched: [7:1.00]
10369  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10370  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10371  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10372  ret <8 x i32> %res
10373}
; Unmasked shuffle taking elements 4-7 of %vec1 followed by elements 4-7 of
; the <8 x i32> loaded from %vec2p. Both check prefixes expect a single
; vperm2i128 with a memory operand.
10374define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
10375; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
10376; GENERIC:       # %bb.0:
10377; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
10378; GENERIC-NEXT:    retq # sched: [1:1.00]
10379;
10380; SKX-LABEL: test_8xi32_shuff_mem_mask0:
10381; SKX:       # %bb.0:
10382; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
10383; SKX-NEXT:    retq # sched: [7:1.00]
10384  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10385  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10386  ret <8 x i32> %res
10387}
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 4-7 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, the others keep %vec3. Checks expect vptestnmd, a
; vshufi32x4 {%k1} with a memory operand into %ymm1, then a vmovdqa into %ymm0.
10388define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
10389; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
10390; GENERIC:       # %bb.0:
10391; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10392; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
10393; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
10394; GENERIC-NEXT:    retq # sched: [1:1.00]
10395;
10396; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
10397; SKX:       # %bb.0:
10398; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10399; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
10400; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
10401; SKX-NEXT:    retq # sched: [7:1.00]
10402  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10403  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10404  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10405  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10406  ret <8 x i32> %res
10407}
10408
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 4-7 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} with a memory operand.
10409define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
10410; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
10411; GENERIC:       # %bb.0:
10412; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
10413; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
10414; GENERIC-NEXT:    retq # sched: [1:1.00]
10415;
10416; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
10417; SKX:       # %bb.0:
10418; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
10419; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
10420; SKX-NEXT:    retq # sched: [7:1.00]
10421  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10422  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
10423  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10424  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10425  ret <8 x i32> %res
10426}
10427
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, the others keep %vec3. Checks expect vptestnmd, a
; vshufi32x4 {%k1} with a memory operand into %ymm1, then a vmovdqa into %ymm0.
10428define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
10429; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
10430; GENERIC:       # %bb.0:
10431; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10432; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
10433; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
10434; GENERIC-NEXT:    retq # sched: [1:1.00]
10435;
10436; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
10437; SKX:       # %bb.0:
10438; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10439; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
10440; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
10441; SKX-NEXT:    retq # sched: [7:1.00]
10442  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10443  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10444  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10445  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10446  ret <8 x i32> %res
10447}
10448
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} with a memory operand.
10449define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
10450; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
10451; GENERIC:       # %bb.0:
10452; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
10453; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
10454; GENERIC-NEXT:    retq # sched: [1:1.00]
10455;
10456; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
10457; SKX:       # %bb.0:
10458; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
10459; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
10460; SKX-NEXT:    retq # sched: [7:1.00]
10461  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10462  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10463  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10464  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10465  ret <8 x i32> %res
10466}
10467
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, the others keep %vec3. Checks expect vptestnmd, a
; vshufi32x4 {%k1} with a memory operand into %ymm1, then a vmovdqa into %ymm0.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_masked_shuff_mem_mask1, so the expected code is the same.
10468define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
10469; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
10470; GENERIC:       # %bb.0:
10471; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10472; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
10473; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
10474; GENERIC-NEXT:    retq # sched: [1:1.00]
10475;
10476; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
10477; SKX:       # %bb.0:
10478; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10479; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
10480; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
10481; SKX-NEXT:    retq # sched: [7:1.00]
10482  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10483  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10484  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10485  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10486  ret <8 x i32> %res
10487}
10488
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} with a memory operand.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_zero_masked_shuff_mem_mask1, so the expected code is the same.
10489define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
10490; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
10491; GENERIC:       # %bb.0:
10492; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
10493; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
10494; GENERIC-NEXT:    retq # sched: [1:1.00]
10495;
10496; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
10497; SKX:       # %bb.0:
10498; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
10499; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
10500; SKX-NEXT:    retq # sched: [7:1.00]
10501  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10502  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10503  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10504  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10505  ret <8 x i32> %res
10506}
10507
; Unmasked shuffle taking elements 4-7 of %vec1 followed by elements 0-3 of
; the <8 x i32> loaded from %vec2p. Both check prefixes expect a single
; vperm2i128 with a memory operand.
10508define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
10509; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
10510; GENERIC:       # %bb.0:
10511; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
10512; GENERIC-NEXT:    retq # sched: [1:1.00]
10513;
10514; SKX-LABEL: test_8xi32_shuff_mem_mask3:
10515; SKX:       # %bb.0:
10516; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
10517; SKX-NEXT:    retq # sched: [7:1.00]
10518  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10519  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10520  ret <8 x i32> %res
10521}
; Merge-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, the others keep %vec3. Checks expect vptestnmd, a
; vshufi32x4 {%k1} with a memory operand into %ymm1, then a vmovdqa into %ymm0.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_masked_shuff_mem_mask1, so the expected code is the same.
10522define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
10523; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
10524; GENERIC:       # %bb.0:
10525; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
10526; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
10527; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
10528; GENERIC-NEXT:    retq # sched: [1:1.00]
10529;
10530; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
10531; SKX:       # %bb.0:
10532; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
10533; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
10534; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
10535; SKX-NEXT:    retq # sched: [7:1.00]
10536  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10537  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10538  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10539  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
10540  ret <8 x i32> %res
10541}
10542
; Zero-masked shuffle of elements 4-7 of %vec1 with elements 0-3 of the
; vector loaded from %vec2p: elements whose %mask entry is zero take the
; shuffle result, all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} with a memory operand.
; NOTE(review): the shuffle indices are identical to those in
; test_8xi32_zero_masked_shuff_mem_mask1, so the expected code is the same.
10543define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
10544; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
10545; GENERIC:       # %bb.0:
10546; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
10547; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
10548; GENERIC-NEXT:    retq # sched: [1:1.00]
10549;
10550; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
10551; SKX:       # %bb.0:
10552; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
10553; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
10554; SKX-NEXT:    retq # sched: [7:1.00]
10555  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
10556  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
10557  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
10558  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
10559  ret <8 x i32> %res
10560}
10561
; Unmasked <16 x i32> shuffle: elements 4-7 of %vec1 twice, then elements 4-7
; and 12-15 of %vec2 (indices 20-23 and 28-31). Both check prefixes expect a
; single vshufi32x4 on zmm registers.
10562define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
10563; GENERIC-LABEL: test_16xi32_shuff_mask0:
10564; GENERIC:       # %bb.0:
10565; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
10566; GENERIC-NEXT:    retq # sched: [1:1.00]
10567;
10568; SKX-LABEL: test_16xi32_shuff_mask0:
10569; SKX:       # %bb.0:
10570; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
10571; SKX-NEXT:    retq # sched: [7:1.00]
10572  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
10573  ret <16 x i32> %res
10574}
; Merge-masked <16 x i32> shuffle (elements 4-7 of %vec1 twice, then elements
; 4-7 and 12-15 of %vec2): elements whose %mask entry is zero take the shuffle
; result, the others keep %vec3. Checks expect vptestnmd, vshufi32x4 {%k1}
; into %zmm2, then a vmovdqa64 of %zmm2 into %zmm0.
10575define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
10576; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
10577; GENERIC:       # %bb.0:
10578; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
10579; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
10580; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
10581; GENERIC-NEXT:    retq # sched: [1:1.00]
10582;
10583; SKX-LABEL: test_16xi32_masked_shuff_mask0:
10584; SKX:       # %bb.0:
10585; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
10586; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
10587; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
10588; SKX-NEXT:    retq # sched: [7:1.00]
10589  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
10590  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10591  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10592  ret <16 x i32> %res
10593}
10594
; Zero-masked <16 x i32> shuffle (elements 4-7 of %vec1 twice, then elements
; 4-7 and 12-15 of %vec2): elements whose %mask entry is zero take the shuffle
; result, all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} into %zmm0.
10595define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
10596; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
10597; GENERIC:       # %bb.0:
10598; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10599; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
10600; GENERIC-NEXT:    retq # sched: [1:1.00]
10601;
10602; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
10603; SKX:       # %bb.0:
10604; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10605; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
10606; SKX-NEXT:    retq # sched: [7:1.00]
10607  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
10608  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10609  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10610  ret <16 x i32> %res
10611}
; Merge-masked <16 x i32> shuffle (elements 8-11 of %vec1 twice, then elements
; 8-11 and 4-7 of %vec2): elements whose %mask entry is zero take the shuffle
; result, the others keep %vec3. Checks expect vptestnmd, vshufi32x4 {%k1}
; into %zmm2, then a vmovdqa64 of %zmm2 into %zmm0.
10612define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
10613; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
10614; GENERIC:       # %bb.0:
10615; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
10616; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
10617; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
10618; GENERIC-NEXT:    retq # sched: [1:1.00]
10619;
10620; SKX-LABEL: test_16xi32_masked_shuff_mask1:
10621; SKX:       # %bb.0:
10622; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
10623; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
10624; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
10625; SKX-NEXT:    retq # sched: [7:1.00]
10626  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
10627  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10628  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10629  ret <16 x i32> %res
10630}
10631
; Zero-masked <16 x i32> shuffle (elements 8-11 of %vec1 twice, then elements
; 8-11 and 4-7 of %vec2): elements whose %mask entry is zero take the shuffle
; result, all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} into %zmm0.
10632define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
10633; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
10634; GENERIC:       # %bb.0:
10635; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10636; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
10637; GENERIC-NEXT:    retq # sched: [1:1.00]
10638;
10639; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
10640; SKX:       # %bb.0:
10641; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10642; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
10643; SKX-NEXT:    retq # sched: [7:1.00]
10644  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
10645  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10646  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10647  ret <16 x i32> %res
10648}
; Merge-masked <16 x i32> shuffle (elements 4-11 of %vec1, then elements 0-3
; of %vec2 twice): elements whose %mask entry is zero take the shuffle result,
; the others keep %vec3. Checks expect vptestnmd, vshufi32x4 {%k1} into %zmm2,
; then a vmovdqa64 of %zmm2 into %zmm0.
10649define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
10650; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
10651; GENERIC:       # %bb.0:
10652; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
10653; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
10654; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
10655; GENERIC-NEXT:    retq # sched: [1:1.00]
10656;
10657; SKX-LABEL: test_16xi32_masked_shuff_mask2:
10658; SKX:       # %bb.0:
10659; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
10660; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
10661; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
10662; SKX-NEXT:    retq # sched: [7:1.00]
10663  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
10664  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10665  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10666  ret <16 x i32> %res
10667}
10668
; Zero-masked <16 x i32> shuffle (elements 4-11 of %vec1, then elements 0-3
; of %vec2 twice): elements whose %mask entry is zero take the shuffle result,
; all others are zero. Checks expect vptestnmd followed by
; vshufi32x4 {%k1} {z} into %zmm0.
10669define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
10670; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
10671; GENERIC:       # %bb.0:
10672; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10673; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
10674; GENERIC-NEXT:    retq # sched: [1:1.00]
10675;
10676; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
10677; SKX:       # %bb.0:
10678; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10679; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
10680; SKX-NEXT:    retq # sched: [7:1.00]
10681  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
10682  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10683  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10684  ret <16 x i32> %res
10685}
; Unmasked shuffle of two <16 x i32> registers in groups of four elements.
; Result = %vec1[4..7], %vec1[0..3], %vec2[8..11], %vec2[4..7].
; The assertions expect a single vshufi32x4 with no mask register.
10686define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
10687; GENERIC-LABEL: test_16xi32_shuff_mask3:
10688; GENERIC:       # %bb.0:
10689; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
10690; GENERIC-NEXT:    retq # sched: [1:1.00]
10691;
10692; SKX-LABEL: test_16xi32_shuff_mask3:
10693; SKX:       # %bb.0:
10694; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
10695; SKX-NEXT:    retq # sched: [7:1.00]
10696  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
10697  ret <16 x i32> %res
10698}
; Merge-masked shuffle of two <16 x i32> registers in groups of four elements.
; Shuffle result = %vec1[4..7], %vec1[0..3], %vec2[8..11], %vec2[4..7]; elements whose
; %mask element equals zero take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmd to produce %k1, then a {%k1}-masked vshufi32x4.
10699define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
10700; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
10701; GENERIC:       # %bb.0:
10702; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
10703; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
10704; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
10705; GENERIC-NEXT:    retq # sched: [1:1.00]
10706;
10707; SKX-LABEL: test_16xi32_masked_shuff_mask3:
10708; SKX:       # %bb.0:
10709; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
10710; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
10711; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
10712; SKX-NEXT:    retq # sched: [7:1.00]
10713  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
10714  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10715  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10716  ret <16 x i32> %res
10717}
10718
; Zero-masked shuffle of two <16 x i32> registers in groups of four elements.
; Shuffle result = %vec1[4..7], %vec1[0..3], %vec2[8..11], %vec2[4..7]; elements whose
; %mask element equals zero take the shuffle result, all others become zero.
; The assertions expect vptestnmd to produce %k1, then a {%k1} {z} vshufi32x4.
10719define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
10720; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
10721; GENERIC:       # %bb.0:
10722; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10723; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
10724; GENERIC-NEXT:    retq # sched: [1:1.00]
10725;
10726; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
10727; SKX:       # %bb.0:
10728; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10729; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
10730; SKX-NEXT:    retq # sched: [7:1.00]
10731  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
10732  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10733  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10734  ret <16 x i32> %res
10735}
; Unmasked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Result = %vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[0..3].
; The assertions expect the load to appear as the memory operand (mem[...]) of a single vshufi32x4.
10736define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
10737; GENERIC-LABEL: test_16xi32_shuff_mem_mask0:
10738; GENERIC:       # %bb.0:
10739; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
10740; GENERIC-NEXT:    retq # sched: [1:1.00]
10741;
10742; SKX-LABEL: test_16xi32_shuff_mem_mask0:
10743; SKX:       # %bb.0:
10744; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
10745; SKX-NEXT:    retq # sched: [7:1.00]
10746  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10747  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
10748  ret <16 x i32> %res
10749}
; Merge-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[0..3]; elements whose
; %mask element equals zero take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmd to produce %k1, then a {%k1}-masked vshufi32x4 with a mem[...] operand.
10750define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
10751; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
10752; GENERIC:       # %bb.0:
10753; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10754; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
10755; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
10756; GENERIC-NEXT:    retq # sched: [1:1.00]
10757;
10758; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
10759; SKX:       # %bb.0:
10760; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10761; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
10762; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
10763; SKX-NEXT:    retq # sched: [7:1.00]
10764  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10765  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
10766  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10767  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10768  ret <16 x i32> %res
10769}
10770
; Zero-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[8..11], %vec1[4..7], %vec2[8..11], %vec2[0..3]; elements whose
; %mask element equals zero take the shuffle result, all others become zero.
; The assertions expect vptestnmd to produce %k1, then a {%k1} {z} vshufi32x4 with a mem[...] operand.
10771define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
10772; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
10773; GENERIC:       # %bb.0:
10774; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
10775; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
10776; GENERIC-NEXT:    retq # sched: [1:1.00]
10777;
10778; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
10779; SKX:       # %bb.0:
10780; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
10781; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
10782; SKX-NEXT:    retq # sched: [7:1.00]
10783  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10784  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
10785  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10786  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10787  ret <16 x i32> %res
10788}
10789
; Merge-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[4..7], %vec1[4..7], %vec2[0..3], %vec2[8..11]; elements whose
; %mask element equals zero take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmd to produce %k1, then a {%k1}-masked vshufi32x4 with a mem[...] operand.
10790define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
10791; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
10792; GENERIC:       # %bb.0:
10793; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10794; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
10795; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
10796; GENERIC-NEXT:    retq # sched: [1:1.00]
10797;
10798; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
10799; SKX:       # %bb.0:
10800; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10801; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
10802; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
10803; SKX-NEXT:    retq # sched: [7:1.00]
10804  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10805  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
10806  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10807  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10808  ret <16 x i32> %res
10809}
10810
; Zero-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[4..7], %vec1[4..7], %vec2[0..3], %vec2[8..11]; elements whose
; %mask element equals zero take the shuffle result, all others become zero.
; The assertions expect vptestnmd to produce %k1, then a {%k1} {z} vshufi32x4 with a mem[...] operand.
10811define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
10812; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
10813; GENERIC:       # %bb.0:
10814; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
10815; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
10816; GENERIC-NEXT:    retq # sched: [1:1.00]
10817;
10818; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
10819; SKX:       # %bb.0:
10820; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
10821; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
10822; SKX-NEXT:    retq # sched: [7:1.00]
10823  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10824  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
10825  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10826  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10827  ret <16 x i32> %res
10828}
10829
; Merge-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[4..7], %vec1[8..11], %vec2[12..15], %vec2[12..15]; elements whose
; %mask element equals zero take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmd to produce %k1, then a {%k1}-masked vshufi32x4 with a mem[...] operand.
10830define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
10831; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
10832; GENERIC:       # %bb.0:
10833; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10834; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
10835; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
10836; GENERIC-NEXT:    retq # sched: [1:1.00]
10837;
10838; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
10839; SKX:       # %bb.0:
10840; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10841; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
10842; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
10843; SKX-NEXT:    retq # sched: [7:1.00]
10844  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10845  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
10846  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10847  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10848  ret <16 x i32> %res
10849}
10850
; Zero-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[4..7], %vec1[8..11], %vec2[12..15], %vec2[12..15]; elements whose
; %mask element equals zero take the shuffle result, all others become zero.
; The assertions expect vptestnmd to produce %k1, then a {%k1} {z} vshufi32x4 with a mem[...] operand.
10851define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
10852; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
10853; GENERIC:       # %bb.0:
10854; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
10855; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
10856; GENERIC-NEXT:    retq # sched: [1:1.00]
10857;
10858; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
10859; SKX:       # %bb.0:
10860; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
10861; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
10862; SKX-NEXT:    retq # sched: [7:1.00]
10863  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10864  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
10865  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10866  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10867  ret <16 x i32> %res
10868}
10869
; Unmasked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Result = %vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15].
; The assertions expect the load to appear as the memory operand (mem[...]) of a single vshufi32x4.
10870define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
10871; GENERIC-LABEL: test_16xi32_shuff_mem_mask3:
10872; GENERIC:       # %bb.0:
10873; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
10874; GENERIC-NEXT:    retq # sched: [1:1.00]
10875;
10876; SKX-LABEL: test_16xi32_shuff_mem_mask3:
10877; SKX:       # %bb.0:
10878; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
10879; SKX-NEXT:    retq # sched: [7:1.00]
10880  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10881  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
10882  ret <16 x i32> %res
10883}
; Merge-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15]; elements whose
; %mask element equals zero take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmd to produce %k1, then a {%k1}-masked vshufi32x4 with a mem[...] operand.
10884define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
10885; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
10886; GENERIC:       # %bb.0:
10887; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
10888; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
10889; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
10890; GENERIC-NEXT:    retq # sched: [1:1.00]
10891;
10892; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
10893; SKX:       # %bb.0:
10894; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
10895; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
10896; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
10897; SKX-NEXT:    retq # sched: [7:1.00]
10898  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10899  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
10900  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10901  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
10902  ret <16 x i32> %res
10903}
10904
; Zero-masked shuffle of a <16 x i32> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[4..7], %vec1[4..7], %vec2[4..7], %vec2[12..15]; elements whose
; %mask element equals zero take the shuffle result, all others become zero.
; The assertions expect vptestnmd to produce %k1, then a {%k1} {z} vshufi32x4 with a mem[...] operand.
10905define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
10906; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
10907; GENERIC:       # %bb.0:
10908; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
10909; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
10910; GENERIC-NEXT:    retq # sched: [1:1.00]
10911;
10912; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
10913; SKX:       # %bb.0:
10914; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
10915; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
10916; SKX-NEXT:    retq # sched: [7:1.00]
10917  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
10918  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
10919  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
10920  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
10921  ret <16 x i32> %res
10922}
10923
; Unmasked shuffle of two <4 x i64> registers in pairs of elements.
; Result = %vec1[2,3], %vec2[0,1].
; The assertions expect vperm2i128 here (not vshufi64x2, which the masked variants expect).
10924define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
10925; GENERIC-LABEL: test_4xi64_shuff_mask0:
10926; GENERIC:       # %bb.0:
10927; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
10928; GENERIC-NEXT:    retq # sched: [1:1.00]
10929;
10930; SKX-LABEL: test_4xi64_shuff_mask0:
10931; SKX:       # %bb.0:
10932; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
10933; SKX-NEXT:    retq # sched: [7:1.00]
10934  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
10935  ret <4 x i64> %res
10936}
; Merge-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmq to produce %k1, then a {%k1}-masked vshufi64x2.
10937define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
10938; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
10939; GENERIC:       # %bb.0:
10940; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
10941; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
10942; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
10943; GENERIC-NEXT:    retq # sched: [1:1.00]
10944;
10945; SKX-LABEL: test_4xi64_masked_shuff_mask0:
10946; SKX:       # %bb.0:
10947; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
10948; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
10949; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
10950; SKX-NEXT:    retq # sched: [7:1.00]
10951  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
10952  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
10953  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
10954  ret <4 x i64> %res
10955}
10956
; Zero-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others become zero.
; The assertions expect vptestnmq to produce %k1, then a {%k1} {z} vshufi64x2.
10957define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
10958; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
10959; GENERIC:       # %bb.0:
10960; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
10961; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
10962; GENERIC-NEXT:    retq # sched: [1:1.00]
10963;
10964; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
10965; SKX:       # %bb.0:
10966; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
10967; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
10968; SKX-NEXT:    retq # sched: [7:1.00]
10969  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
10970  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
10971  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
10972  ret <4 x i64> %res
10973}
; Merge-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[2,3]; elements whose %mask element equals zero
; take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmq to produce %k1, then a {%k1}-masked vshufi64x2.
10974define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
10975; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
10976; GENERIC:       # %bb.0:
10977; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
10978; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
10979; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
10980; GENERIC-NEXT:    retq # sched: [1:1.00]
10981;
10982; SKX-LABEL: test_4xi64_masked_shuff_mask1:
10983; SKX:       # %bb.0:
10984; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
10985; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
10986; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
10987; SKX-NEXT:    retq # sched: [7:1.00]
10988  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
10989  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
10990  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
10991  ret <4 x i64> %res
10992}
10993
; Zero-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[2,3]; elements whose %mask element equals zero
; take the shuffle result, all others become zero.
; The assertions expect vptestnmq to produce %k1, then a {%k1} {z} vshufi64x2.
10994define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
10995; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
10996; GENERIC:       # %bb.0:
10997; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
10998; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
10999; GENERIC-NEXT:    retq # sched: [1:1.00]
11000;
11001; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
11002; SKX:       # %bb.0:
11003; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
11004; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
11005; SKX-NEXT:    retq # sched: [7:1.00]
11006  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11007  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11008  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
11009  ret <4 x i64> %res
11010}
; Merge-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmq to produce %k1, then a {%k1}-masked vshufi64x2.
; NOTE(review): the shuffle indices equal those of test_4xi64_masked_shuff_mask0 - presumably intentional; confirm.
11011define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
11012; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
11013; GENERIC:       # %bb.0:
11014; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
11015; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
11016; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
11017; GENERIC-NEXT:    retq # sched: [1:1.00]
11018;
11019; SKX-LABEL: test_4xi64_masked_shuff_mask2:
11020; SKX:       # %bb.0:
11021; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
11022; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
11023; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
11024; SKX-NEXT:    retq # sched: [7:1.00]
11025  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
11026  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11027  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
11028  ret <4 x i64> %res
11029}
11030
; Zero-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others become zero.
; The assertions expect vptestnmq to produce %k1, then a {%k1} {z} vshufi64x2.
; NOTE(review): the shuffle indices equal those of test_4xi64_zero_masked_shuff_mask0 - presumably intentional; confirm.
11031define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
11032; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
11033; GENERIC:       # %bb.0:
11034; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
11035; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
11036; GENERIC-NEXT:    retq # sched: [1:1.00]
11037;
11038; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
11039; SKX:       # %bb.0:
11040; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
11041; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
11042; SKX-NEXT:    retq # sched: [7:1.00]
11043  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
11044  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11045  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
11046  ret <4 x i64> %res
11047}
; Unmasked shuffle of two <4 x i64> registers in pairs of elements.
; Result = %vec1[2,3], %vec2[2,3].
; The assertions expect vperm2i128 here (not vshufi64x2, which the masked variants expect).
11048define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
11049; GENERIC-LABEL: test_4xi64_shuff_mask3:
11050; GENERIC:       # %bb.0:
11051; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
11052; GENERIC-NEXT:    retq # sched: [1:1.00]
11053;
11054; SKX-LABEL: test_4xi64_shuff_mask3:
11055; SKX:       # %bb.0:
11056; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
11057; SKX-NEXT:    retq # sched: [7:1.00]
11058  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11059  ret <4 x i64> %res
11060}
; Merge-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[2,3]; elements whose %mask element equals zero
; take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmq to produce %k1, then a {%k1}-masked vshufi64x2.
; NOTE(review): the shuffle indices equal those of test_4xi64_masked_shuff_mask1 - presumably intentional; confirm.
11061define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
11062; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
11063; GENERIC:       # %bb.0:
11064; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
11065; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
11066; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
11067; GENERIC-NEXT:    retq # sched: [1:1.00]
11068;
11069; SKX-LABEL: test_4xi64_masked_shuff_mask3:
11070; SKX:       # %bb.0:
11071; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
11072; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
11073; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
11074; SKX-NEXT:    retq # sched: [7:1.00]
11075  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11076  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11077  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
11078  ret <4 x i64> %res
11079}
11080
; Zero-masked shuffle of two <4 x i64> registers in pairs of elements.
; Shuffle result = %vec1[2,3], %vec2[2,3]; elements whose %mask element equals zero
; take the shuffle result, all others become zero.
; The assertions expect vptestnmq to produce %k1, then a {%k1} {z} vshufi64x2.
; NOTE(review): the shuffle indices equal those of test_4xi64_zero_masked_shuff_mask1 - presumably intentional; confirm.
11081define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
11082; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
11083; GENERIC:       # %bb.0:
11084; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
11085; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
11086; GENERIC-NEXT:    retq # sched: [1:1.00]
11087;
11088; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
11089; SKX:       # %bb.0:
11090; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
11091; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
11092; SKX-NEXT:    retq # sched: [7:1.00]
11093  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11094  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11095  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
11096  ret <4 x i64> %res
11097}
; Unmasked shuffle of a <4 x i64> register with a vector loaded from %vec2p.
; Result = %vec1[2,3], %vec2[2,3].
; The assertions expect the load to appear as the memory operand (mem[...]) of a single vperm2i128.
11098define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
11099; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
11100; GENERIC:       # %bb.0:
11101; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
11102; GENERIC-NEXT:    retq # sched: [1:1.00]
11103;
11104; SKX-LABEL: test_4xi64_shuff_mem_mask0:
11105; SKX:       # %bb.0:
11106; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
11107; SKX-NEXT:    retq # sched: [7:1.00]
11108  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11109  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11110  ret <4 x i64> %res
11111}
; Merge-masked shuffle of a <4 x i64> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[2,3], %vec2[2,3]; elements whose %mask element equals zero
; take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmq to produce %k1, then a {%k1}-masked vshufi64x2 with a mem[...] operand.
11112define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
11113; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
11114; GENERIC:       # %bb.0:
11115; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
11116; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
11117; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
11118; GENERIC-NEXT:    retq # sched: [1:1.00]
11119;
11120; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
11121; SKX:       # %bb.0:
11122; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
11123; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
11124; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
11125; SKX-NEXT:    retq # sched: [7:1.00]
11126  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11127  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11128  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11129  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
11130  ret <4 x i64> %res
11131}
11132
; Zero-masked shuffle of a <4 x i64> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[2,3], %vec2[2,3]; elements whose %mask element equals zero
; take the shuffle result, all others become zero.
; The assertions expect vptestnmq to produce %k1, then a {%k1} {z} vshufi64x2 with a mem[...] operand.
11133define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
11134; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
11135; GENERIC:       # %bb.0:
11136; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
11137; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
11138; GENERIC-NEXT:    retq # sched: [1:1.00]
11139;
11140; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
11141; SKX:       # %bb.0:
11142; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
11143; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
11144; SKX-NEXT:    retq # sched: [7:1.00]
11145  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11146  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11147  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11148  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
11149  ret <4 x i64> %res
11150}
11151
; Merge-masked shuffle of a <4 x i64> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmq to produce %k1, then a {%k1}-masked vshufi64x2 with a mem[...] operand.
11152define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
11153; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
11154; GENERIC:       # %bb.0:
11155; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
11156; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
11157; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
11158; GENERIC-NEXT:    retq # sched: [1:1.00]
11159;
11160; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
11161; SKX:       # %bb.0:
11162; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
11163; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
11164; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
11165; SKX-NEXT:    retq # sched: [7:1.00]
11166  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11167  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
11168  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11169  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
11170  ret <4 x i64> %res
11171}
11172
; Zero-masked shuffle of a <4 x i64> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others become zero.
; The assertions expect vptestnmq to produce %k1, then a {%k1} {z} vshufi64x2 with a mem[...] operand.
11173define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
11174; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
11175; GENERIC:       # %bb.0:
11176; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
11177; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
11178; GENERIC-NEXT:    retq # sched: [1:1.00]
11179;
11180; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
11181; SKX:       # %bb.0:
11182; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
11183; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
11184; SKX-NEXT:    retq # sched: [7:1.00]
11185  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11186  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
11187  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11188  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
11189  ret <4 x i64> %res
11190}
11191
; Merge-masked shuffle of a <4 x i64> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others keep %vec3.
; The assertions expect vptestnmq to produce %k1, then a {%k1}-masked vshufi64x2 with a mem[...] operand.
; NOTE(review): the shuffle indices equal those of test_4xi64_masked_shuff_mem_mask1 - presumably intentional; confirm.
11192define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
11193; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
11194; GENERIC:       # %bb.0:
11195; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
11196; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
11197; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
11198; GENERIC-NEXT:    retq # sched: [1:1.00]
11199;
11200; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
11201; SKX:       # %bb.0:
11202; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
11203; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
11204; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
11205; SKX-NEXT:    retq # sched: [7:1.00]
11206  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11207  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
11208  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11209  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
11210  ret <4 x i64> %res
11211}
11212
; Zero-masked shuffle of a <4 x i64> register with a vector loaded from %vec2p.
; Shuffle result = %vec1[2,3], %vec2[0,1]; elements whose %mask element equals zero
; take the shuffle result, all others become zero.
; The assertions expect vptestnmq to produce %k1, then a {%k1} {z} vshufi64x2 with a mem[...] operand.
; NOTE(review): the shuffle indices equal those of test_4xi64_zero_masked_shuff_mem_mask1 - presumably intentional; confirm.
11213define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
11214; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
11215; GENERIC:       # %bb.0:
11216; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
11217; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
11218; GENERIC-NEXT:    retq # sched: [1:1.00]
11219;
11220; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
11221; SKX:       # %bb.0:
11222; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
11223; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
11224; SKX-NEXT:    retq # sched: [7:1.00]
11225  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11226  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
11227  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11228  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
11229  ret <4 x i64> %res
11230}
11231
; Unmasked 256-bit shuffle with a folded load: elements 2-3 of %vec1 followed
; by elements 2-3 of the vector loaded from %vec2p. With no mask involved the
; expected instruction is vperm2i128 with a memory operand, not vshufi64x2.
11232define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
11233; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
11234; GENERIC:       # %bb.0:
11235; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
11236; GENERIC-NEXT:    retq # sched: [1:1.00]
11237;
11238; SKX-LABEL: test_4xi64_shuff_mem_mask3:
11239; SKX:       # %bb.0:
11240; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
11241; SKX-NEXT:    retq # sched: [7:1.00]
11242  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11243  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11244  ret <4 x i64> %res
11245}
; Merge-masked 256-bit shuffle with a folded load: elements 2-3 of %vec1
; followed by elements 2-3 of the vector loaded from %vec2p. Lanes whose %mask
; element is zero receive the shuffle result; the remaining lanes keep %vec3.
; Expected as vptestnmq, a masked vshufi64x2 with a memory operand, and a
; register copy from ymm1 to ymm0.
11246define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
11247; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
11248; GENERIC:       # %bb.0:
11249; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
11250; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
11251; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
11252; GENERIC-NEXT:    retq # sched: [1:1.00]
11253;
11254; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
11255; SKX:       # %bb.0:
11256; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
11257; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
11258; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
11259; SKX-NEXT:    retq # sched: [7:1.00]
11260  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11261  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11262  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11263  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
11264  ret <4 x i64> %res
11265}
11266
; Zero-masked 256-bit shuffle with a folded load: elements 2-3 of %vec1
; followed by elements 2-3 of the vector loaded from %vec2p. Lanes whose %mask
; element is zero receive the shuffle result; all other lanes are zeroed.
; Expected as vptestnmq plus a {z}-masked vshufi64x2 with a memory operand.
11267define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
11268; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
11269; GENERIC:       # %bb.0:
11270; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
11271; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
11272; GENERIC-NEXT:    retq # sched: [1:1.00]
11273;
11274; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
11275; SKX:       # %bb.0:
11276; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
11277; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
11278; SKX-NEXT:    retq # sched: [7:1.00]
11279  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
11280  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
11281  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
11282  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
11283  ret <4 x i64> %res
11284}
11285
; Unmasked 512-bit register-register shuffle: indices 4,5,4,5 pick elements
; 4-5 of %vec1 twice, and indices 12,13,12,13 pick elements 4-5 of %vec2
; twice. Expected as a single vshufi64x2 on zmm registers.
11286define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
11287; GENERIC-LABEL: test_8xi64_shuff_mask0:
11288; GENERIC:       # %bb.0:
11289; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
11290; GENERIC-NEXT:    retq # sched: [1:1.00]
11291;
11292; SKX-LABEL: test_8xi64_shuff_mask0:
11293; SKX:       # %bb.0:
11294; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
11295; SKX-NEXT:    retq # sched: [7:1.00]
11296  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
11297  ret <8 x i64> %res
11298}
; Merge-masked 512-bit register-register shuffle: elements 4-5 of %vec1 twice,
; then elements 4-5 of %vec2 twice. Lanes whose %mask element is zero receive
; the shuffle result; the remaining lanes keep %vec3. Expected as vptestnmq, a
; masked vshufi64x2 writing zmm2, and a vmovdqa64 copy from zmm2 to zmm0.
11299define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
11300; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
11301; GENERIC:       # %bb.0:
11302; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
11303; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
11304; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
11305; GENERIC-NEXT:    retq # sched: [1:1.00]
11306;
11307; SKX-LABEL: test_8xi64_masked_shuff_mask0:
11308; SKX:       # %bb.0:
11309; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
11310; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
11311; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
11312; SKX-NEXT:    retq # sched: [7:1.00]
11313  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
11314  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11315  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11316  ret <8 x i64> %res
11317}
11318
; Zero-masked 512-bit register-register shuffle: elements 4-5 of %vec1 twice,
; then elements 4-5 of %vec2 twice. Lanes whose %mask element is zero receive
; the shuffle result; all other lanes are zeroed. Expected as vptestnmq plus
; a {z}-masked vshufi64x2.
11319define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
11320; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
11321; GENERIC:       # %bb.0:
11322; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11323; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
11324; GENERIC-NEXT:    retq # sched: [1:1.00]
11325;
11326; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
11327; SKX:       # %bb.0:
11328; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11329; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
11330; SKX-NEXT:    retq # sched: [7:1.00]
11331  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
11332  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11333  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11334  ret <8 x i64> %res
11335}
; Merge-masked 512-bit register-register shuffle: elements 6-7 and 4-5 of
; %vec1, then elements 2-3 and 4-5 of %vec2 (indices 10,11,12,13). Lanes whose
; %mask element is zero receive the shuffle result; the remaining lanes keep
; %vec3. Expected as vptestnmq, a masked vshufi64x2 writing zmm2, and a
; vmovdqa64 copy from zmm2 to zmm0.
11336define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
11337; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
11338; GENERIC:       # %bb.0:
11339; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
11340; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
11341; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
11342; GENERIC-NEXT:    retq # sched: [1:1.00]
11343;
11344; SKX-LABEL: test_8xi64_masked_shuff_mask1:
11345; SKX:       # %bb.0:
11346; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
11347; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
11348; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
11349; SKX-NEXT:    retq # sched: [7:1.00]
11350  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
11351  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11352  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11353  ret <8 x i64> %res
11354}
11355
; Zero-masked 512-bit register-register shuffle: elements 6-7 and 4-5 of
; %vec1, then elements 2-3 and 4-5 of %vec2 (indices 10,11,12,13). Lanes whose
; %mask element is zero receive the shuffle result; all other lanes are
; zeroed. Expected as vptestnmq plus a {z}-masked vshufi64x2.
11356define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
11357; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
11358; GENERIC:       # %bb.0:
11359; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11360; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
11361; GENERIC-NEXT:    retq # sched: [1:1.00]
11362;
11363; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
11364; SKX:       # %bb.0:
11365; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11366; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
11367; SKX-NEXT:    retq # sched: [7:1.00]
11368  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
11369  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11370  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11371  ret <8 x i64> %res
11372}
; Merge-masked 512-bit register-register shuffle: elements 0-1 and 4-5 of
; %vec1, then elements 0-1 of %vec2 twice (indices 8,9,8,9). Lanes whose %mask
; element is zero receive the shuffle result; the remaining lanes keep %vec3.
; Expected as vptestnmq, a masked vshufi64x2 writing zmm2, and a vmovdqa64
; copy from zmm2 to zmm0.
11373define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
11374; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
11375; GENERIC:       # %bb.0:
11376; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
11377; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
11378; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
11379; GENERIC-NEXT:    retq # sched: [1:1.00]
11380;
11381; SKX-LABEL: test_8xi64_masked_shuff_mask2:
11382; SKX:       # %bb.0:
11383; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
11384; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
11385; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
11386; SKX-NEXT:    retq # sched: [7:1.00]
11387  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
11388  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11389  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11390  ret <8 x i64> %res
11391}
11392
; Zero-masked 512-bit register-register shuffle: elements 0-1 and 4-5 of
; %vec1, then elements 0-1 of %vec2 twice (indices 8,9,8,9). Lanes whose %mask
; element is zero receive the shuffle result; all other lanes are zeroed.
; Expected as vptestnmq plus a {z}-masked vshufi64x2.
11393define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
11394; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
11395; GENERIC:       # %bb.0:
11396; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11397; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
11398; GENERIC-NEXT:    retq # sched: [1:1.00]
11399;
11400; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
11401; SKX:       # %bb.0:
11402; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11403; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
11404; SKX-NEXT:    retq # sched: [7:1.00]
11405  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
11406  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11407  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11408  ret <8 x i64> %res
11409}
; Unmasked 512-bit register-register shuffle: elements 2-3 and 6-7 of %vec1,
; then elements 4-5 and 2-3 of %vec2 (indices 12,13,10,11). Expected as a
; single vshufi64x2 on zmm registers.
11410define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
11411; GENERIC-LABEL: test_8xi64_shuff_mask3:
11412; GENERIC:       # %bb.0:
11413; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
11414; GENERIC-NEXT:    retq # sched: [1:1.00]
11415;
11416; SKX-LABEL: test_8xi64_shuff_mask3:
11417; SKX:       # %bb.0:
11418; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
11419; SKX-NEXT:    retq # sched: [7:1.00]
11420  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
11421  ret <8 x i64> %res
11422}
; Merge-masked 512-bit register-register shuffle: elements 2-3 and 6-7 of
; %vec1, then elements 4-5 and 2-3 of %vec2 (indices 12,13,10,11). Lanes whose
; %mask element is zero receive the shuffle result; the remaining lanes keep
; %vec3. Expected as vptestnmq, a masked vshufi64x2 writing zmm2, and a
; vmovdqa64 copy from zmm2 to zmm0.
11423define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
11424; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
11425; GENERIC:       # %bb.0:
11426; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
11427; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
11428; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
11429; GENERIC-NEXT:    retq # sched: [1:1.00]
11430;
11431; SKX-LABEL: test_8xi64_masked_shuff_mask3:
11432; SKX:       # %bb.0:
11433; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
11434; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
11435; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
11436; SKX-NEXT:    retq # sched: [7:1.00]
11437  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
11438  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11439  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11440  ret <8 x i64> %res
11441}
11442
; Zero-masked 512-bit register-register shuffle: elements 2-3 and 6-7 of
; %vec1, then elements 4-5 and 2-3 of %vec2 (indices 12,13,10,11). Lanes whose
; %mask element is zero receive the shuffle result; all other lanes are
; zeroed. Expected as vptestnmq plus a {z}-masked vshufi64x2.
11443define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
11444; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
11445; GENERIC:       # %bb.0:
11446; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11447; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
11448; GENERIC-NEXT:    retq # sched: [1:1.00]
11449;
11450; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
11451; SKX:       # %bb.0:
11452; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11453; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
11454; SKX-NEXT:    retq # sched: [7:1.00]
11455  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
11456  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11457  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11458  ret <8 x i64> %res
11459}
; Unmasked 512-bit shuffle with a folded load: elements 2-3 of %vec1 twice,
; then elements 4-5 and 2-3 of the vector loaded from %vec2p (indices
; 12,13,10,11). Expected as a single vshufi64x2 with a memory operand.
11460define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
11461; GENERIC-LABEL: test_8xi64_shuff_mem_mask0:
11462; GENERIC:       # %bb.0:
11463; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
11464; GENERIC-NEXT:    retq # sched: [1:1.00]
11465;
11466; SKX-LABEL: test_8xi64_shuff_mem_mask0:
11467; SKX:       # %bb.0:
11468; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
11469; SKX-NEXT:    retq # sched: [7:1.00]
11470  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11471  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
11472  ret <8 x i64> %res
11473}
; Merge-masked 512-bit shuffle with a folded load: elements 2-3 of %vec1
; twice, then elements 4-5 and 2-3 of the vector loaded from %vec2p. Lanes
; whose %mask element is zero receive the shuffle result; the remaining lanes
; keep %vec3. Expected as vptestnmq, a masked vshufi64x2 with a memory operand
; writing zmm1, and a vmovdqa64 copy from zmm1 to zmm0.
11474define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
11475; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
11476; GENERIC:       # %bb.0:
11477; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11478; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
11479; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
11480; GENERIC-NEXT:    retq # sched: [1:1.00]
11481;
11482; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
11483; SKX:       # %bb.0:
11484; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11485; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
11486; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
11487; SKX-NEXT:    retq # sched: [7:1.00]
11488  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11489  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
11490  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11491  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11492  ret <8 x i64> %res
11493}
11494
; Zero-masked 512-bit shuffle with a folded load: elements 2-3 of %vec1 twice,
; then elements 4-5 and 2-3 of the vector loaded from %vec2p. Lanes whose
; %mask element is zero receive the shuffle result; all other lanes are
; zeroed. Expected as vptestnmq plus a {z}-masked vshufi64x2 with a memory
; operand.
11495define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
11496; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
11497; GENERIC:       # %bb.0:
11498; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
11499; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
11500; GENERIC-NEXT:    retq # sched: [1:1.00]
11501;
11502; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
11503; SKX:       # %bb.0:
11504; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
11505; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
11506; SKX-NEXT:    retq # sched: [7:1.00]
11507  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11508  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
11509  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11510  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11511  ret <8 x i64> %res
11512}
11513
; Merge-masked 512-bit shuffle with a folded load: elements 2-3 and 0-1 of
; %vec1, then elements 0-1 of the vector loaded from %vec2p twice (indices
; 8,9,8,9). Lanes whose %mask element is zero receive the shuffle result; the
; remaining lanes keep %vec3. Expected as vptestnmq, a masked vshufi64x2 with
; a memory operand writing zmm1, and a vmovdqa64 copy from zmm1 to zmm0.
11514define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
11515; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
11516; GENERIC:       # %bb.0:
11517; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11518; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
11519; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
11520; GENERIC-NEXT:    retq # sched: [1:1.00]
11521;
11522; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
11523; SKX:       # %bb.0:
11524; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11525; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
11526; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
11527; SKX-NEXT:    retq # sched: [7:1.00]
11528  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11529  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
11530  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11531  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11532  ret <8 x i64> %res
11533}
11534
; Zero-masked 512-bit shuffle with a folded load: elements 2-3 and 0-1 of
; %vec1, then elements 0-1 of the vector loaded from %vec2p twice (indices
; 8,9,8,9). Lanes whose %mask element is zero receive the shuffle result; all
; other lanes are zeroed. Expected as vptestnmq plus a {z}-masked vshufi64x2
; with a memory operand.
11535define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
11536; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
11537; GENERIC:       # %bb.0:
11538; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
11539; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
11540; GENERIC-NEXT:    retq # sched: [1:1.00]
11541;
11542; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
11543; SKX:       # %bb.0:
11544; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
11545; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
11546; SKX-NEXT:    retq # sched: [7:1.00]
11547  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11548  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
11549  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11550  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11551  ret <8 x i64> %res
11552}
11553
; Merge-masked 512-bit shuffle with a folded load: elements 4-5 and 0-1 of
; %vec1, then elements 2-3 of the vector loaded from %vec2p twice (indices
; 10,11,10,11). Lanes whose %mask element is zero receive the shuffle result;
; the remaining lanes keep %vec3. Expected as vptestnmq, a masked vshufi64x2
; with a memory operand writing zmm1, and a vmovdqa64 copy from zmm1 to zmm0.
11554define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
11555; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
11556; GENERIC:       # %bb.0:
11557; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11558; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
11559; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
11560; GENERIC-NEXT:    retq # sched: [1:1.00]
11561;
11562; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
11563; SKX:       # %bb.0:
11564; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11565; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
11566; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
11567; SKX-NEXT:    retq # sched: [7:1.00]
11568  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11569  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
11570  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11571  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11572  ret <8 x i64> %res
11573}
11574
; Zero-masked 512-bit shuffle with a folded load: elements 4-5 and 0-1 of
; %vec1, then elements 2-3 of the vector loaded from %vec2p twice (indices
; 10,11,10,11). Lanes whose %mask element is zero receive the shuffle result;
; all other lanes are zeroed. Expected as vptestnmq plus a {z}-masked
; vshufi64x2 with a memory operand.
11575define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
11576; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
11577; GENERIC:       # %bb.0:
11578; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
11579; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
11580; GENERIC-NEXT:    retq # sched: [1:1.00]
11581;
11582; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
11583; SKX:       # %bb.0:
11584; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
11585; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
11586; SKX-NEXT:    retq # sched: [7:1.00]
11587  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11588  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
11589  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11590  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11591  ret <8 x i64> %res
11592}
11593
; Unmasked 512-bit shuffle with a folded load: elements 2-3 and 0-1 of %vec1,
; then elements 6-7 and 2-3 of the vector loaded from %vec2p (indices
; 14,15,10,11). Expected as a single vshufi64x2 with a memory operand.
11594define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
11595; GENERIC-LABEL: test_8xi64_shuff_mem_mask3:
11596; GENERIC:       # %bb.0:
11597; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
11598; GENERIC-NEXT:    retq # sched: [1:1.00]
11599;
11600; SKX-LABEL: test_8xi64_shuff_mem_mask3:
11601; SKX:       # %bb.0:
11602; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
11603; SKX-NEXT:    retq # sched: [7:1.00]
11604  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11605  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
11606  ret <8 x i64> %res
11607}
; Merge-masked 512-bit shuffle with a folded load: elements 2-3 and 0-1 of
; %vec1, then elements 6-7 and 2-3 of the vector loaded from %vec2p (indices
; 14,15,10,11). Lanes whose %mask element is zero receive the shuffle result;
; the remaining lanes keep %vec3. Expected as vptestnmq, a masked vshufi64x2
; with a memory operand writing zmm1, and a vmovdqa64 copy from zmm1 to zmm0.
11608define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
11609; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
11610; GENERIC:       # %bb.0:
11611; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
11612; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
11613; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
11614; GENERIC-NEXT:    retq # sched: [1:1.00]
11615;
11616; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
11617; SKX:       # %bb.0:
11618; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
11619; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
11620; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
11621; SKX-NEXT:    retq # sched: [7:1.00]
11622  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11623  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
11624  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11625  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
11626  ret <8 x i64> %res
11627}
11628
; Zero-masked 512-bit shuffle with a folded load: elements 2-3 and 0-1 of
; %vec1, then elements 6-7 and 2-3 of the vector loaded from %vec2p (indices
; 14,15,10,11). Lanes whose %mask element is zero receive the shuffle result;
; all other lanes are zeroed. Expected as vptestnmq plus a {z}-masked
; vshufi64x2 with a memory operand.
11629define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
11630; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
11631; GENERIC:       # %bb.0:
11632; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
11633; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
11634; GENERIC-NEXT:    retq # sched: [1:1.00]
11635;
11636; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
11637; SKX:       # %bb.0:
11638; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
11639; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
11640; SKX-NEXT:    retq # sched: [7:1.00]
11641  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
11642  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
11643  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
11644  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
11645  ret <8 x i64> %res
11646}
11647
; Unmasked 128-bit unpack-low: interleaves elements 0 and 1 of %vec1 with
; elements 0 and 1 of %vec2 (shuffle indices 0,4,1,5). Expected as a single
; vunpcklps on xmm registers.
11648define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
11649; GENERIC-LABEL: test_4xfloat_unpack_low_mask0:
11650; GENERIC:       # %bb.0:
11651; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11652; GENERIC-NEXT:    retq # sched: [1:1.00]
11653;
11654; SKX-LABEL: test_4xfloat_unpack_low_mask0:
11655; SKX:       # %bb.0:
11656; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11657; SKX-NEXT:    retq # sched: [7:1.00]
11658  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11659  ret <4 x float> %res
11660}
; Merge-masked 128-bit unpack-low: interleaves elements 0 and 1 of %vec1 with
; elements 0 and 1 of %vec2 (indices 0,4,1,5). Lanes whose i32 %mask element
; is zero receive the shuffle result; the remaining lanes keep %vec3. Expected
; as vptestnmd, a masked vunpcklps writing xmm2, and a vmovaps copy from xmm2
; to xmm0.
11661define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
11662; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
11663; GENERIC:       # %bb.0:
11664; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
11665; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11666; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
11667; GENERIC-NEXT:    retq # sched: [1:1.00]
11668;
11669; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
11670; SKX:       # %bb.0:
11671; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
11672; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11673; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
11674; SKX-NEXT:    retq # sched: [7:1.00]
11675  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11676  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11677  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11678  ret <4 x float> %res
11679}
11680
; Zero-masked 128-bit unpack-low: interleaves elements 0 and 1 of %vec1 with
; elements 0 and 1 of %vec2 (indices 0,4,1,5). Lanes whose i32 %mask element
; is zero receive the shuffle result; all other lanes are zeroed. Expected as
; vptestnmd plus a {z}-masked vunpcklps.
11681define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
11682; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
11683; GENERIC:       # %bb.0:
11684; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11685; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11686; GENERIC-NEXT:    retq # sched: [1:1.00]
11687;
11688; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
11689; SKX:       # %bb.0:
11690; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11691; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11692; SKX-NEXT:    retq # sched: [7:1.00]
11693  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11694  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11695  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
11696  ret <4 x float> %res
11697}
; Merge-masked 128-bit unpack-low (indices 0,4,1,5): lanes whose i32 %mask
; element is zero receive the interleaved result; the remaining lanes keep
; %vec3. Expected as vptestnmd, a masked vunpcklps writing xmm2, and a vmovaps
; copy from xmm2 to xmm0.
; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mask0; only
; the function name differs, so this case adds no distinct coverage.
11698define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
11699; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
11700; GENERIC:       # %bb.0:
11701; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
11702; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11703; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
11704; GENERIC-NEXT:    retq # sched: [1:1.00]
11705;
11706; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
11707; SKX:       # %bb.0:
11708; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
11709; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11710; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
11711; SKX-NEXT:    retq # sched: [7:1.00]
11712  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11713  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11714  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11715  ret <4 x float> %res
11716}
11717
; Zero-masked 128-bit unpack-low (indices 0,4,1,5): lanes whose i32 %mask
; element is zero receive the interleaved result; all other lanes are zeroed.
; Expected as vptestnmd plus a {z}-masked vunpcklps.
; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_low_mask0;
; only the function name differs, so this case adds no distinct coverage.
11718define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
11719; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
11720; GENERIC:       # %bb.0:
11721; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11722; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11723; GENERIC-NEXT:    retq # sched: [1:1.00]
11724;
11725; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
11726; SKX:       # %bb.0:
11727; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11728; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11729; SKX-NEXT:    retq # sched: [7:1.00]
11730  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11731  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11732  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
11733  ret <4 x float> %res
11734}
; Merge-masked 128-bit unpack-low (indices 0,4,1,5): lanes whose i32 %mask
; element is zero receive the interleaved result; the remaining lanes keep
; %vec3. Expected as vptestnmd, a masked vunpcklps writing xmm2, and a vmovaps
; copy from xmm2 to xmm0.
; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mask0; only
; the function name differs, so this case adds no distinct coverage.
11735define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
11736; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
11737; GENERIC:       # %bb.0:
11738; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
11739; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11740; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
11741; GENERIC-NEXT:    retq # sched: [1:1.00]
11742;
11743; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
11744; SKX:       # %bb.0:
11745; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
11746; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11747; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
11748; SKX-NEXT:    retq # sched: [7:1.00]
11749  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11750  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11751  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11752  ret <4 x float> %res
11753}
11754
; Zero-masked 128-bit unpack-low (indices 0,4,1,5): lanes whose i32 %mask
; element is zero receive the interleaved result; all other lanes are zeroed.
; Expected as vptestnmd plus a {z}-masked vunpcklps.
; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_low_mask0;
; only the function name differs, so this case adds no distinct coverage.
11755define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
11756; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
11757; GENERIC:       # %bb.0:
11758; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11759; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11760; GENERIC-NEXT:    retq # sched: [1:1.00]
11761;
11762; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
11763; SKX:       # %bb.0:
11764; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11765; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11766; SKX-NEXT:    retq # sched: [7:1.00]
11767  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11768  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11769  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
11770  ret <4 x float> %res
11771}
; Unmasked 128-bit unpack-low: interleaves elements 0 and 1 of %vec1 with
; elements 0 and 1 of %vec2 (shuffle indices 0,4,1,5). Expected as a single
; vunpcklps on xmm registers.
; NOTE(review): the IR body matches test_4xfloat_unpack_low_mask0; only the
; function name differs, so this case adds no distinct coverage.
11772define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
11773; GENERIC-LABEL: test_4xfloat_unpack_low_mask3:
11774; GENERIC:       # %bb.0:
11775; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11776; GENERIC-NEXT:    retq # sched: [1:1.00]
11777;
11778; SKX-LABEL: test_4xfloat_unpack_low_mask3:
11779; SKX:       # %bb.0:
11780; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11781; SKX-NEXT:    retq # sched: [7:1.00]
11782  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11783  ret <4 x float> %res
11784}
; Merge-masked 128-bit unpack-low (indices 0,4,1,5): lanes whose i32 %mask
; element is zero receive the interleaved result; the remaining lanes keep
; %vec3. Expected as vptestnmd, a masked vunpcklps writing xmm2, and a vmovaps
; copy from xmm2 to xmm0.
; NOTE(review): the IR body matches test_4xfloat_masked_unpack_low_mask0; only
; the function name differs, so this case adds no distinct coverage.
11785define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
11786; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
11787; GENERIC:       # %bb.0:
11788; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
11789; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11790; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
11791; GENERIC-NEXT:    retq # sched: [1:1.00]
11792;
11793; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
11794; SKX:       # %bb.0:
11795; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
11796; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11797; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
11798; SKX-NEXT:    retq # sched: [7:1.00]
11799  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11800  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11801  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11802  ret <4 x float> %res
11803}
11804
; Zero-masked low unpack of two <4 x float> registers. Lanes whose %mask
; element equals zero take the <0,4,1,5> interleave of %vec1/%vec2; all other
; lanes are zero. Both run lines expect vptestnmd to build %k1 and a
; {%k1} {z} vunpcklps.
; NOTE(review): the IR body and expected code match
; test_4xfloat_zero_masked_unpack_low_mask2; only the function name differs.
11805define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
11806; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
11807; GENERIC:       # %bb.0:
11808; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11809; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11810; GENERIC-NEXT:    retq # sched: [1:1.00]
11811;
11812; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
11813; SKX:       # %bb.0:
11814; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11815; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
11816; SKX-NEXT:    retq # sched: [7:1.00]
11817  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11818  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11819  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
11820  ret <4 x float> %res
11821}
; Unmasked low unpack of a <4 x float> register with a <4 x float> loaded from
; %vec2p (shuffle mask <0,4,1,5>). Both run lines expect the load to appear as
; the mem operand of a single vunpcklps.
11822define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
11823; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask0:
11824; GENERIC:       # %bb.0:
11825; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11826; GENERIC-NEXT:    retq # sched: [1:1.00]
11827;
11828; SKX-LABEL: test_4xfloat_unpack_low_mem_mask0:
11829; SKX:       # %bb.0:
11830; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11831; SKX-NEXT:    retq # sched: [7:1.00]
11832  %vec2 = load <4 x float>, <4 x float>* %vec2p
11833  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11834  ret <4 x float> %res
11835}
; Merge-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes keep %vec3. Both run lines expect vptestnmd to
; build %k1, a {%k1} vunpcklps with a mem operand into xmm1, and a vmovaps of
; xmm1 into xmm0.
11836define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
11837; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
11838; GENERIC:       # %bb.0:
11839; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11840; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11841; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
11842; GENERIC-NEXT:    retq # sched: [1:1.00]
11843;
11844; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
11845; SKX:       # %bb.0:
11846; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11847; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11848; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
11849; SKX-NEXT:    retq # sched: [7:1.00]
11850  %vec2 = load <4 x float>, <4 x float>* %vec2p
11851  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11852  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11853  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11854  ret <4 x float> %res
11855}
11856
; Zero-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes are zero. Both run lines expect vptestnmd to
; build %k1 and a {%k1} {z} vunpcklps with a mem operand.
11857define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
11858; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
11859; GENERIC:       # %bb.0:
11860; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
11861; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11862; GENERIC-NEXT:    retq # sched: [1:1.00]
11863;
11864; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
11865; SKX:       # %bb.0:
11866; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
11867; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11868; SKX-NEXT:    retq # sched: [7:1.00]
11869  %vec2 = load <4 x float>, <4 x float>* %vec2p
11870  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11871  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11872  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
11873  ret <4 x float> %res
11874}
11875
; Merge-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes keep %vec3. Both run lines expect vptestnmd to
; build %k1, a {%k1} vunpcklps with a mem operand into xmm1, and a vmovaps of
; xmm1 into xmm0.
; NOTE(review): the IR body and expected code match
; test_4xfloat_masked_unpack_low_mem_mask0; only the function name differs.
11876define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
11877; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
11878; GENERIC:       # %bb.0:
11879; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11880; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11881; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
11882; GENERIC-NEXT:    retq # sched: [1:1.00]
11883;
11884; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
11885; SKX:       # %bb.0:
11886; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11887; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11888; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
11889; SKX-NEXT:    retq # sched: [7:1.00]
11890  %vec2 = load <4 x float>, <4 x float>* %vec2p
11891  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11892  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11893  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11894  ret <4 x float> %res
11895}
11896
; Zero-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes are zero. Both run lines expect vptestnmd to
; build %k1 and a {%k1} {z} vunpcklps with a mem operand.
; NOTE(review): the IR body and expected code match
; test_4xfloat_zero_masked_unpack_low_mem_mask0; only the function name differs.
11897define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
11898; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
11899; GENERIC:       # %bb.0:
11900; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
11901; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11902; GENERIC-NEXT:    retq # sched: [1:1.00]
11903;
11904; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
11905; SKX:       # %bb.0:
11906; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
11907; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11908; SKX-NEXT:    retq # sched: [7:1.00]
11909  %vec2 = load <4 x float>, <4 x float>* %vec2p
11910  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11911  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11912  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
11913  ret <4 x float> %res
11914}
11915
; Merge-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes keep %vec3. Both run lines expect vptestnmd to
; build %k1, a {%k1} vunpcklps with a mem operand into xmm1, and a vmovaps of
; xmm1 into xmm0.
; NOTE(review): the IR body and expected code match
; test_4xfloat_masked_unpack_low_mem_mask0; only the function name differs.
11916define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
11917; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
11918; GENERIC:       # %bb.0:
11919; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11920; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11921; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
11922; GENERIC-NEXT:    retq # sched: [1:1.00]
11923;
11924; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
11925; SKX:       # %bb.0:
11926; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11927; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11928; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
11929; SKX-NEXT:    retq # sched: [7:1.00]
11930  %vec2 = load <4 x float>, <4 x float>* %vec2p
11931  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11932  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11933  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11934  ret <4 x float> %res
11935}
11936
; Zero-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes are zero. Both run lines expect vptestnmd to
; build %k1 and a {%k1} {z} vunpcklps with a mem operand.
; NOTE(review): the IR body and expected code match
; test_4xfloat_zero_masked_unpack_low_mem_mask0; only the function name differs.
11937define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
11938; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
11939; GENERIC:       # %bb.0:
11940; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
11941; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11942; GENERIC-NEXT:    retq # sched: [1:1.00]
11943;
11944; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
11945; SKX:       # %bb.0:
11946; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
11947; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11948; SKX-NEXT:    retq # sched: [7:1.00]
11949  %vec2 = load <4 x float>, <4 x float>* %vec2p
11950  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11951  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11952  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
11953  ret <4 x float> %res
11954}
11955
; Unmasked low unpack of a <4 x float> register with a <4 x float> loaded from
; %vec2p (shuffle mask <0,4,1,5>). Both run lines expect the load to appear as
; the mem operand of a single vunpcklps.
; NOTE(review): the IR body and expected code match
; test_4xfloat_unpack_low_mem_mask0; only the function name differs.
11956define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
11957; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask3:
11958; GENERIC:       # %bb.0:
11959; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11960; GENERIC-NEXT:    retq # sched: [1:1.00]
11961;
11962; SKX-LABEL: test_4xfloat_unpack_low_mem_mask3:
11963; SKX:       # %bb.0:
11964; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11965; SKX-NEXT:    retq # sched: [7:1.00]
11966  %vec2 = load <4 x float>, <4 x float>* %vec2p
11967  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11968  ret <4 x float> %res
11969}
; Merge-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes keep %vec3. Both run lines expect vptestnmd to
; build %k1, a {%k1} vunpcklps with a mem operand into xmm1, and a vmovaps of
; xmm1 into xmm0.
; NOTE(review): the IR body and expected code match
; test_4xfloat_masked_unpack_low_mem_mask0; only the function name differs.
11970define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
11971; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
11972; GENERIC:       # %bb.0:
11973; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
11974; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11975; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
11976; GENERIC-NEXT:    retq # sched: [1:1.00]
11977;
11978; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
11979; SKX:       # %bb.0:
11980; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
11981; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11982; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
11983; SKX-NEXT:    retq # sched: [7:1.00]
11984  %vec2 = load <4 x float>, <4 x float>* %vec2p
11985  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
11986  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
11987  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
11988  ret <4 x float> %res
11989}
11990
; Zero-masked low unpack of a <4 x float> register with a <4 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the <0,4,1,5>
; interleave; all other lanes are zero. Both run lines expect vptestnmd to
; build %k1 and a {%k1} {z} vunpcklps with a mem operand.
; NOTE(review): the IR body and expected code match
; test_4xfloat_zero_masked_unpack_low_mem_mask0; only the function name differs.
11991define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
11992; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
11993; GENERIC:       # %bb.0:
11994; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
11995; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
11996; GENERIC-NEXT:    retq # sched: [1:1.00]
11997;
11998; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
11999; SKX:       # %bb.0:
12000; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
12001; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
12002; SKX-NEXT:    retq # sched: [7:1.00]
12003  %vec2 = load <4 x float>, <4 x float>* %vec2p
12004  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
12005  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
12006  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
12007  ret <4 x float> %res
12008}
12009
; Unmasked low unpack of two <8 x float> registers. Shuffle mask
; <0,8,1,9,4,12,5,13> interleaves elements 0,1 and 4,5 of %vec1 with the same
; elements of %vec2. Both run lines expect a single register-register
; vunpcklps on ymm registers.
12010define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
12011; GENERIC-LABEL: test_8xfloat_unpack_low_mask0:
12012; GENERIC:       # %bb.0:
12013; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12014; GENERIC-NEXT:    retq # sched: [1:1.00]
12015;
12016; SKX-LABEL: test_8xfloat_unpack_low_mask0:
12017; SKX:       # %bb.0:
12018; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12019; SKX-NEXT:    retq # sched: [7:1.00]
12020  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12021  ret <8 x float> %res
12022}
; Merge-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes keep %vec3. Both run lines expect vptestnmd to build %k1, a
; {%k1} vunpcklps into ymm2, and a vmovaps of ymm2 into ymm0.
12023define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
12024; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
12025; GENERIC:       # %bb.0:
12026; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
12027; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12028; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
12029; GENERIC-NEXT:    retq # sched: [1:1.00]
12030;
12031; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
12032; SKX:       # %bb.0:
12033; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
12034; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12035; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
12036; SKX-NEXT:    retq # sched: [7:1.00]
12037  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12038  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12039  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12040  ret <8 x float> %res
12041}
12042
; Zero-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes are zero. Both run lines expect vptestnmd to build %k1 and a
; {%k1} {z} vunpcklps on ymm registers.
12043define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
12044; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
12045; GENERIC:       # %bb.0:
12046; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12047; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12048; GENERIC-NEXT:    retq # sched: [1:1.00]
12049;
12050; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
12051; SKX:       # %bb.0:
12052; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12053; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12054; SKX-NEXT:    retq # sched: [7:1.00]
12055  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12056  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12057  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12058  ret <8 x float> %res
12059}
; Merge-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes keep %vec3. Both run lines expect vptestnmd to build %k1, a
; {%k1} vunpcklps into ymm2, and a vmovaps of ymm2 into ymm0.
; NOTE(review): the IR body and expected code match
; test_8xfloat_masked_unpack_low_mask0; only the function name differs.
12060define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
12061; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
12062; GENERIC:       # %bb.0:
12063; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
12064; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12065; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
12066; GENERIC-NEXT:    retq # sched: [1:1.00]
12067;
12068; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
12069; SKX:       # %bb.0:
12070; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
12071; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12072; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
12073; SKX-NEXT:    retq # sched: [7:1.00]
12074  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12075  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12076  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12077  ret <8 x float> %res
12078}
12079
; Zero-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes are zero. Both run lines expect vptestnmd to build %k1 and a
; {%k1} {z} vunpcklps on ymm registers.
; NOTE(review): the IR body and expected code match
; test_8xfloat_zero_masked_unpack_low_mask0; only the function name differs.
12080define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
12081; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
12082; GENERIC:       # %bb.0:
12083; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12084; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12085; GENERIC-NEXT:    retq # sched: [1:1.00]
12086;
12087; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
12088; SKX:       # %bb.0:
12089; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12090; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12091; SKX-NEXT:    retq # sched: [7:1.00]
12092  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12093  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12094  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12095  ret <8 x float> %res
12096}
; Merge-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes keep %vec3. Both run lines expect vptestnmd to build %k1, a
; {%k1} vunpcklps into ymm2, and a vmovaps of ymm2 into ymm0.
; NOTE(review): the IR body and expected code match
; test_8xfloat_masked_unpack_low_mask0; only the function name differs.
12097define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
12098; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
12099; GENERIC:       # %bb.0:
12100; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
12101; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12102; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
12103; GENERIC-NEXT:    retq # sched: [1:1.00]
12104;
12105; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
12106; SKX:       # %bb.0:
12107; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
12108; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12109; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
12110; SKX-NEXT:    retq # sched: [7:1.00]
12111  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12112  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12113  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12114  ret <8 x float> %res
12115}
12116
; Zero-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes are zero. Both run lines expect vptestnmd to build %k1 and a
; {%k1} {z} vunpcklps on ymm registers.
; NOTE(review): the IR body and expected code match
; test_8xfloat_zero_masked_unpack_low_mask0; only the function name differs.
12117define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
12118; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
12119; GENERIC:       # %bb.0:
12120; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12121; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12122; GENERIC-NEXT:    retq # sched: [1:1.00]
12123;
12124; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
12125; SKX:       # %bb.0:
12126; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12127; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12128; SKX-NEXT:    retq # sched: [7:1.00]
12129  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12130  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12131  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12132  ret <8 x float> %res
12133}
; Unmasked low unpack of two <8 x float> registers. Shuffle mask
; <0,8,1,9,4,12,5,13> interleaves elements 0,1 and 4,5 of %vec1 with the same
; elements of %vec2. Both run lines expect a single register-register
; vunpcklps on ymm registers.
; NOTE(review): the IR body and expected code match
; test_8xfloat_unpack_low_mask0; only the function name differs.
12134define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
12135; GENERIC-LABEL: test_8xfloat_unpack_low_mask3:
12136; GENERIC:       # %bb.0:
12137; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12138; GENERIC-NEXT:    retq # sched: [1:1.00]
12139;
12140; SKX-LABEL: test_8xfloat_unpack_low_mask3:
12141; SKX:       # %bb.0:
12142; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12143; SKX-NEXT:    retq # sched: [7:1.00]
12144  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12145  ret <8 x float> %res
12146}
; Merge-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes keep %vec3. Both run lines expect vptestnmd to build %k1, a
; {%k1} vunpcklps into ymm2, and a vmovaps of ymm2 into ymm0.
; NOTE(review): the IR body and expected code match
; test_8xfloat_masked_unpack_low_mask0; only the function name differs.
12147define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
12148; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
12149; GENERIC:       # %bb.0:
12150; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
12151; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12152; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
12153; GENERIC-NEXT:    retq # sched: [1:1.00]
12154;
12155; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
12156; SKX:       # %bb.0:
12157; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
12158; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12159; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
12160; SKX-NEXT:    retq # sched: [7:1.00]
12161  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12162  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12163  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12164  ret <8 x float> %res
12165}
12166
; Zero-masked low unpack of two <8 x float> registers. Lanes whose %mask
; element equals zero take the <0,8,1,9,4,12,5,13> interleave of %vec1/%vec2;
; all other lanes are zero. Both run lines expect vptestnmd to build %k1 and a
; {%k1} {z} vunpcklps on ymm registers.
; NOTE(review): the IR body and expected code match
; test_8xfloat_zero_masked_unpack_low_mask0; only the function name differs.
12167define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
12168; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
12169; GENERIC:       # %bb.0:
12170; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12171; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12172; GENERIC-NEXT:    retq # sched: [1:1.00]
12173;
12174; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
12175; SKX:       # %bb.0:
12176; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12177; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
12178; SKX-NEXT:    retq # sched: [7:1.00]
12179  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12180  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12181  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12182  ret <8 x float> %res
12183}
; Unmasked low unpack of an <8 x float> register with an <8 x float> loaded
; from %vec2p (shuffle mask <0,8,1,9,4,12,5,13>). Both run lines expect the
; load to appear as the mem operand of a single ymm vunpcklps.
12184define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
12185; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask0:
12186; GENERIC:       # %bb.0:
12187; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12188; GENERIC-NEXT:    retq # sched: [1:1.00]
12189;
12190; SKX-LABEL: test_8xfloat_unpack_low_mem_mask0:
12191; SKX:       # %bb.0:
12192; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12193; SKX-NEXT:    retq # sched: [7:1.00]
12194  %vec2 = load <8 x float>, <8 x float>* %vec2p
12195  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12196  ret <8 x float> %res
12197}
; Merge-masked low unpack of an <8 x float> register with an <8 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the
; <0,8,1,9,4,12,5,13> interleave; all other lanes keep %vec3. Both run lines
; expect vptestnmd to build %k1, a {%k1} vunpcklps with a mem operand into
; ymm1, and a vmovaps of ymm1 into ymm0.
12198define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
12199; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
12200; GENERIC:       # %bb.0:
12201; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12202; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12203; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
12204; GENERIC-NEXT:    retq # sched: [1:1.00]
12205;
12206; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
12207; SKX:       # %bb.0:
12208; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12209; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12210; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
12211; SKX-NEXT:    retq # sched: [7:1.00]
12212  %vec2 = load <8 x float>, <8 x float>* %vec2p
12213  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12214  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12215  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12216  ret <8 x float> %res
12217}
12218
; Zero-masked low unpack of an <8 x float> register with an <8 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the
; <0,8,1,9,4,12,5,13> interleave; all other lanes are zero. Both run lines
; expect vptestnmd to build %k1 and a {%k1} {z} vunpcklps with a mem operand.
12219define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
12220; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
12221; GENERIC:       # %bb.0:
12222; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
12223; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12224; GENERIC-NEXT:    retq # sched: [1:1.00]
12225;
12226; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
12227; SKX:       # %bb.0:
12228; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
12229; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12230; SKX-NEXT:    retq # sched: [7:1.00]
12231  %vec2 = load <8 x float>, <8 x float>* %vec2p
12232  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12233  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12234  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12235  ret <8 x float> %res
12236}
12237
; Merge-masked low unpack of an <8 x float> register with an <8 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the
; <0,8,1,9,4,12,5,13> interleave; all other lanes keep %vec3. Both run lines
; expect vptestnmd to build %k1, a {%k1} vunpcklps with a mem operand into
; ymm1, and a vmovaps of ymm1 into ymm0.
; NOTE(review): the IR body and expected code match
; test_8xfloat_masked_unpack_low_mem_mask0; only the function name differs.
12238define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
12239; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
12240; GENERIC:       # %bb.0:
12241; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12242; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12243; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
12244; GENERIC-NEXT:    retq # sched: [1:1.00]
12245;
12246; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
12247; SKX:       # %bb.0:
12248; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12249; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12250; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
12251; SKX-NEXT:    retq # sched: [7:1.00]
12252  %vec2 = load <8 x float>, <8 x float>* %vec2p
12253  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12254  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12255  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12256  ret <8 x float> %res
12257}
12258
; Zero-masked low unpack of an <8 x float> register with an <8 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the
; <0,8,1,9,4,12,5,13> interleave; all other lanes are zero. Both run lines
; expect vptestnmd to build %k1 and a {%k1} {z} vunpcklps with a mem operand.
; NOTE(review): the IR body and expected code match
; test_8xfloat_zero_masked_unpack_low_mem_mask0; only the function name differs.
12259define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
12260; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
12261; GENERIC:       # %bb.0:
12262; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
12263; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12264; GENERIC-NEXT:    retq # sched: [1:1.00]
12265;
12266; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
12267; SKX:       # %bb.0:
12268; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
12269; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12270; SKX-NEXT:    retq # sched: [7:1.00]
12271  %vec2 = load <8 x float>, <8 x float>* %vec2p
12272  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12273  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12274  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12275  ret <8 x float> %res
12276}
12277
; Merge-masked low unpack of an <8 x float> register with an <8 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the
; <0,8,1,9,4,12,5,13> interleave; all other lanes keep %vec3. Both run lines
; expect vptestnmd to build %k1, a {%k1} vunpcklps with a mem operand into
; ymm1, and a vmovaps of ymm1 into ymm0.
; NOTE(review): the IR body and expected code match
; test_8xfloat_masked_unpack_low_mem_mask0; only the function name differs.
12278define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
12279; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
12280; GENERIC:       # %bb.0:
12281; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12282; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12283; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
12284; GENERIC-NEXT:    retq # sched: [1:1.00]
12285;
12286; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
12287; SKX:       # %bb.0:
12288; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12289; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12290; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
12291; SKX-NEXT:    retq # sched: [7:1.00]
12292  %vec2 = load <8 x float>, <8 x float>* %vec2p
12293  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12294  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12295  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12296  ret <8 x float> %res
12297}
12298
; Zero-masked low unpack of an <8 x float> register with an <8 x float> loaded
; from %vec2p. Lanes whose %mask element equals zero take the
; <0,8,1,9,4,12,5,13> interleave; all other lanes are zero. Both run lines
; expect vptestnmd to build %k1 and a {%k1} {z} vunpcklps with a mem operand.
; NOTE(review): the IR body and expected code match
; test_8xfloat_zero_masked_unpack_low_mem_mask0; only the function name differs.
12299define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
12300; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
12301; GENERIC:       # %bb.0:
12302; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
12303; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12304; GENERIC-NEXT:    retq # sched: [1:1.00]
12305;
12306; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
12307; SKX:       # %bb.0:
12308; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
12309; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12310; SKX-NEXT:    retq # sched: [7:1.00]
12311  %vec2 = load <8 x float>, <8 x float>* %vec2p
12312  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12313  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12314  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12315  ret <8 x float> %res
12316}
12317
; Unmasked unpack-low of <8 x float> with the second operand loaded from
; %vec2p. The shufflevector interleaves elements 0 and 1 of each 4-element
; group of %vec1 with the same elements of the loaded %vec2. The assertions
; expect a single vunpcklps whose second source is a mem operand.
12318define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
12319; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask3:
12320; GENERIC:       # %bb.0:
12321; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12322; GENERIC-NEXT:    retq # sched: [1:1.00]
12323;
12324; SKX-LABEL: test_8xfloat_unpack_low_mem_mask3:
12325; SKX:       # %bb.0:
12326; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12327; SKX-NEXT:    retq # sched: [7:1.00]
12328  %vec2 = load <8 x float>, <8 x float>* %vec2p
12329  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12330  ret <8 x float> %res
12331}
; Merge-masked unpack-low of <8 x float> with the second operand loaded from
; %vec2p. Lanes whose %mask element equals zero take the interleave of
; elements 0 and 1 of each 4-element group of %vec1 and %vec2; every other
; lane keeps the value from %vec3. The assertions expect vptestnmd to produce
; %k1, a {%k1} vunpcklps with a mem source into ymm1, and a vmovaps to ymm0.
12332define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
12333; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
12334; GENERIC:       # %bb.0:
12335; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
12336; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12337; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
12338; GENERIC-NEXT:    retq # sched: [1:1.00]
12339;
12340; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
12341; SKX:       # %bb.0:
12342; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
12343; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12344; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
12345; SKX-NEXT:    retq # sched: [7:1.00]
12346  %vec2 = load <8 x float>, <8 x float>* %vec2p
12347  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12348  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12349  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
12350  ret <8 x float> %res
12351}
12352
; Zero-masked unpack-low of <8 x float> with the second operand loaded from
; %vec2p. Lanes whose %mask element equals zero take the interleave of
; elements 0 and 1 of each 4-element group of %vec1 and %vec2; every other
; lane is zero. The assertions expect vptestnmd to produce %k1 followed by a
; {%k1} {z} vunpcklps whose second source is a mem operand.
12353define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
12354; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
12355; GENERIC:       # %bb.0:
12356; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
12357; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12358; GENERIC-NEXT:    retq # sched: [1:1.00]
12359;
12360; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
12361; SKX:       # %bb.0:
12362; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
12363; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
12364; SKX-NEXT:    retq # sched: [7:1.00]
12365  %vec2 = load <8 x float>, <8 x float>* %vec2p
12366  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
12367  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
12368  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
12369  ret <8 x float> %res
12370}
12371
; Unmasked unpack-low of two <16 x float> values passed as arguments. The
; shufflevector interleaves elements 0 and 1 of each 4-element group of %vec1
; with the same elements of %vec2. The assertions expect a single
; register-form vunpcklps on zmm0 and zmm1.
12372define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
12373; GENERIC-LABEL: test_16xfloat_unpack_low_mask0:
12374; GENERIC:       # %bb.0:
12375; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12376; GENERIC-NEXT:    retq # sched: [1:1.00]
12377;
12378; SKX-LABEL: test_16xfloat_unpack_low_mask0:
12379; SKX:       # %bb.0:
12380; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12381; SKX-NEXT:    retq # sched: [7:1.00]
12382  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12383  ret <16 x float> %res
12384}
; Merge-masked unpack-low of two <16 x float> arguments. Lanes whose %mask
; element equals zero take the interleave of elements 0 and 1 of each
; 4-element group of %vec1 and %vec2; every other lane keeps the value from
; %vec3. The assertions expect vptestnmd to produce %k1, a {%k1} vunpcklps
; writing zmm2, and a vmovaps copying zmm2 to zmm0.
12385define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
12386; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
12387; GENERIC:       # %bb.0:
12388; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
12389; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12390; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
12391; GENERIC-NEXT:    retq # sched: [1:1.00]
12392;
12393; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
12394; SKX:       # %bb.0:
12395; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
12396; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12397; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
12398; SKX-NEXT:    retq # sched: [7:1.00]
12399  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12400  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12401  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12402  ret <16 x float> %res
12403}
12404
; Zero-masked unpack-low of two <16 x float> arguments. Lanes whose %mask
; element equals zero take the interleave of elements 0 and 1 of each
; 4-element group of %vec1 and %vec2; every other lane is zero. The assertions
; expect vptestnmd to produce %k1 followed by a {%k1} {z} vunpcklps on zmm0.
12405define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
12406; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
12407; GENERIC:       # %bb.0:
12408; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12409; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12410; GENERIC-NEXT:    retq # sched: [1:1.00]
12411;
12412; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
12413; SKX:       # %bb.0:
12414; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12415; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12416; SKX-NEXT:    retq # sched: [7:1.00]
12417  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12418  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12419  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12420  ret <16 x float> %res
12421}
; Merge-masked unpack-low of two <16 x float> arguments. The select keeps the
; shuffle result (elements 0 and 1 of each 4-element group of %vec1 and %vec2,
; interleaved) in lanes where %mask is zero and takes %vec3 elsewhere. The
; assertions expect vptestnmd to produce %k1, a {%k1} vunpcklps writing zmm2,
; and a vmovaps copying zmm2 to zmm0.
12422define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
12423; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
12424; GENERIC:       # %bb.0:
12425; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
12426; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12427; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
12428; GENERIC-NEXT:    retq # sched: [1:1.00]
12429;
12430; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
12431; SKX:       # %bb.0:
12432; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
12433; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12434; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
12435; SKX-NEXT:    retq # sched: [7:1.00]
12436  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12437  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12438  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12439  ret <16 x float> %res
12440}
12441
; Zero-masked unpack-low of two <16 x float> arguments. The select keeps the
; shuffle result (elements 0 and 1 of each 4-element group of %vec1 and %vec2,
; interleaved) in lanes where %mask is zero and produces zero elsewhere. The
; assertions expect vptestnmd to produce %k1 and a {%k1} {z} vunpcklps on zmm0.
12442define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
12443; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
12444; GENERIC:       # %bb.0:
12445; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12446; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12447; GENERIC-NEXT:    retq # sched: [1:1.00]
12448;
12449; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
12450; SKX:       # %bb.0:
12451; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12452; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12453; SKX-NEXT:    retq # sched: [7:1.00]
12454  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12455  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12456  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12457  ret <16 x float> %res
12458}
; Merge-masked unpack-low of two <16 x float> arguments. Where a %mask lane
; is zero the result is the interleave of elements 0 and 1 of each 4-element
; group of %vec1 and %vec2; otherwise the lane comes from %vec3. The
; assertions expect vptestnmd to produce %k1, a {%k1} vunpcklps writing zmm2,
; and a vmovaps copying zmm2 to zmm0.
12459define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
12460; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
12461; GENERIC:       # %bb.0:
12462; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
12463; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12464; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
12465; GENERIC-NEXT:    retq # sched: [1:1.00]
12466;
12467; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
12468; SKX:       # %bb.0:
12469; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
12470; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12471; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
12472; SKX-NEXT:    retq # sched: [7:1.00]
12473  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12474  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12475  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12476  ret <16 x float> %res
12477}
12478
; Zero-masked unpack-low of two <16 x float> arguments. Where a %mask lane is
; zero the result is the interleave of elements 0 and 1 of each 4-element
; group of %vec1 and %vec2; otherwise the lane is zero. The assertions expect
; vptestnmd to produce %k1 followed by a {%k1} {z} vunpcklps on zmm0.
12479define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
12480; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
12481; GENERIC:       # %bb.0:
12482; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12483; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12484; GENERIC-NEXT:    retq # sched: [1:1.00]
12485;
12486; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
12487; SKX:       # %bb.0:
12488; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12489; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12490; SKX-NEXT:    retq # sched: [7:1.00]
12491  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12492  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12493  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12494  ret <16 x float> %res
12495}
; Unmasked unpack-low of two <16 x float> arguments: the shufflevector
; interleaves elements 0 and 1 of each 4-element group of %vec1 with the same
; elements of %vec2. The assertions expect a single register-form vunpcklps
; on zmm0 and zmm1.
12496define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
12497; GENERIC-LABEL: test_16xfloat_unpack_low_mask3:
12498; GENERIC:       # %bb.0:
12499; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12500; GENERIC-NEXT:    retq # sched: [1:1.00]
12501;
12502; SKX-LABEL: test_16xfloat_unpack_low_mask3:
12503; SKX:       # %bb.0:
12504; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12505; SKX-NEXT:    retq # sched: [7:1.00]
12506  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12507  ret <16 x float> %res
12508}
; Merge-masked unpack-low of two <16 x float> arguments. The icmp marks lanes
; whose %mask element is zero; those lanes take the interleave of elements 0
; and 1 of each 4-element group of %vec1 and %vec2, the rest take %vec3. The
; assertions expect vptestnmd to produce %k1, a {%k1} vunpcklps writing zmm2,
; and a vmovaps copying zmm2 to zmm0.
12509define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
12510; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
12511; GENERIC:       # %bb.0:
12512; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
12513; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12514; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
12515; GENERIC-NEXT:    retq # sched: [1:1.00]
12516;
12517; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
12518; SKX:       # %bb.0:
12519; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
12520; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12521; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
12522; SKX-NEXT:    retq # sched: [7:1.00]
12523  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12524  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12525  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12526  ret <16 x float> %res
12527}
12528
; Zero-masked unpack-low of two <16 x float> arguments. The icmp marks lanes
; whose %mask element is zero; those lanes take the interleave of elements 0
; and 1 of each 4-element group of %vec1 and %vec2, the rest are zero. The
; assertions expect vptestnmd to produce %k1 and a {%k1} {z} vunpcklps on zmm0.
12529define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
12530; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
12531; GENERIC:       # %bb.0:
12532; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12533; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12534; GENERIC-NEXT:    retq # sched: [1:1.00]
12535;
12536; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
12537; SKX:       # %bb.0:
12538; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12539; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
12540; SKX-NEXT:    retq # sched: [7:1.00]
12541  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12542  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12543  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12544  ret <16 x float> %res
12545}
; Unmasked unpack-low of <16 x float> with the second operand loaded from
; %vec2p. The shufflevector interleaves elements 0 and 1 of each 4-element
; group of %vec1 with the same elements of the loaded %vec2. The assertions
; expect a single vunpcklps whose second source is a mem operand.
12546define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
12547; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask0:
12548; GENERIC:       # %bb.0:
12549; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12550; GENERIC-NEXT:    retq # sched: [1:1.00]
12551;
12552; SKX-LABEL: test_16xfloat_unpack_low_mem_mask0:
12553; SKX:       # %bb.0:
12554; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12555; SKX-NEXT:    retq # sched: [7:1.00]
12556  %vec2 = load <16 x float>, <16 x float>* %vec2p
12557  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12558  ret <16 x float> %res
12559}
; Merge-masked unpack-low of <16 x float> with the second operand loaded from
; %vec2p. Lanes whose %mask element equals zero take the interleave of
; elements 0 and 1 of each 4-element group of %vec1 and %vec2; every other
; lane keeps the value from %vec3. The assertions expect vptestnmd to produce
; %k1, a {%k1} vunpcklps with a mem source into zmm1, and a vmovaps to zmm0.
12560define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
12561; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
12562; GENERIC:       # %bb.0:
12563; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12564; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12565; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
12566; GENERIC-NEXT:    retq # sched: [1:1.00]
12567;
12568; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
12569; SKX:       # %bb.0:
12570; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12571; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12572; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
12573; SKX-NEXT:    retq # sched: [7:1.00]
12574  %vec2 = load <16 x float>, <16 x float>* %vec2p
12575  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12576  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12577  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12578  ret <16 x float> %res
12579}
12580
; Zero-masked unpack-low of <16 x float> with the second operand loaded from
; %vec2p. Lanes whose %mask element equals zero take the interleave of
; elements 0 and 1 of each 4-element group of %vec1 and %vec2; every other
; lane is zero. The assertions expect vptestnmd to produce %k1 followed by a
; {%k1} {z} vunpcklps whose second source is a mem operand.
12581define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
12582; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
12583; GENERIC:       # %bb.0:
12584; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
12585; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12586; GENERIC-NEXT:    retq # sched: [1:1.00]
12587;
12588; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
12589; SKX:       # %bb.0:
12590; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
12591; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12592; SKX-NEXT:    retq # sched: [7:1.00]
12593  %vec2 = load <16 x float>, <16 x float>* %vec2p
12594  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12595  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12596  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12597  ret <16 x float> %res
12598}
12599
; Merge-masked unpack-low of <16 x float>; %vec2 is loaded from %vec2p. The
; select keeps the shuffle result (elements 0 and 1 of each 4-element group
; of %vec1 and %vec2, interleaved) where %mask is zero and takes %vec3
; elsewhere. The assertions expect vptestnmd to produce %k1, a {%k1}
; vunpcklps with a mem source into zmm1, and a vmovaps to zmm0.
12600define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
12601; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
12602; GENERIC:       # %bb.0:
12603; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12604; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12605; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
12606; GENERIC-NEXT:    retq # sched: [1:1.00]
12607;
12608; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
12609; SKX:       # %bb.0:
12610; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12611; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12612; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
12613; SKX-NEXT:    retq # sched: [7:1.00]
12614  %vec2 = load <16 x float>, <16 x float>* %vec2p
12615  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12616  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12617  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12618  ret <16 x float> %res
12619}
12620
; Zero-masked unpack-low of <16 x float>; %vec2 is loaded from %vec2p. The
; select keeps the shuffle result (elements 0 and 1 of each 4-element group
; of %vec1 and %vec2, interleaved) where %mask is zero and produces zero
; elsewhere. The assertions expect vptestnmd to produce %k1 followed by a
; {%k1} {z} vunpcklps whose second source is a mem operand.
12621define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
12622; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
12623; GENERIC:       # %bb.0:
12624; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
12625; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12626; GENERIC-NEXT:    retq # sched: [1:1.00]
12627;
12628; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
12629; SKX:       # %bb.0:
12630; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
12631; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12632; SKX-NEXT:    retq # sched: [7:1.00]
12633  %vec2 = load <16 x float>, <16 x float>* %vec2p
12634  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12635  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12636  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12637  ret <16 x float> %res
12638}
12639
; Merge-masked unpack-low of <16 x float>; %vec2 is loaded from %vec2p. Where
; a %mask lane is zero the result is the interleave of elements 0 and 1 of
; each 4-element group of %vec1 and %vec2; otherwise the lane comes from
; %vec3. The assertions expect vptestnmd to produce %k1, a {%k1} vunpcklps
; with a mem source into zmm1, and a vmovaps to zmm0.
12640define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
12641; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
12642; GENERIC:       # %bb.0:
12643; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12644; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12645; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
12646; GENERIC-NEXT:    retq # sched: [1:1.00]
12647;
12648; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
12649; SKX:       # %bb.0:
12650; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12651; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12652; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
12653; SKX-NEXT:    retq # sched: [7:1.00]
12654  %vec2 = load <16 x float>, <16 x float>* %vec2p
12655  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12656  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12657  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12658  ret <16 x float> %res
12659}
12660
; Zero-masked unpack-low of <16 x float>; %vec2 is loaded from %vec2p. Where
; a %mask lane is zero the result is the interleave of elements 0 and 1 of
; each 4-element group of %vec1 and %vec2; otherwise the lane is zero. The
; assertions expect vptestnmd to produce %k1 followed by a {%k1} {z}
; vunpcklps whose second source is a mem operand.
12661define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
12662; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
12663; GENERIC:       # %bb.0:
12664; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
12665; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12666; GENERIC-NEXT:    retq # sched: [1:1.00]
12667;
12668; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
12669; SKX:       # %bb.0:
12670; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
12671; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12672; SKX-NEXT:    retq # sched: [7:1.00]
12673  %vec2 = load <16 x float>, <16 x float>* %vec2p
12674  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12675  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12676  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12677  ret <16 x float> %res
12678}
12679
; Unmasked unpack-low of <16 x float>; %vec2 is loaded from %vec2p and the
; shufflevector interleaves elements 0 and 1 of each 4-element group of %vec1
; with the same elements of %vec2. The assertions expect a single vunpcklps
; whose second source is a mem operand.
12680define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
12681; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask3:
12682; GENERIC:       # %bb.0:
12683; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12684; GENERIC-NEXT:    retq # sched: [1:1.00]
12685;
12686; SKX-LABEL: test_16xfloat_unpack_low_mem_mask3:
12687; SKX:       # %bb.0:
12688; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12689; SKX-NEXT:    retq # sched: [7:1.00]
12690  %vec2 = load <16 x float>, <16 x float>* %vec2p
12691  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12692  ret <16 x float> %res
12693}
; Merge-masked unpack-low of <16 x float>; %vec2 is loaded from %vec2p. The
; icmp marks lanes whose %mask element is zero; those lanes take the
; interleave of elements 0 and 1 of each 4-element group of %vec1 and %vec2,
; the rest take %vec3. The assertions expect vptestnmd to produce %k1, a
; {%k1} vunpcklps with a mem source into zmm1, and a vmovaps to zmm0.
12694define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
12695; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
12696; GENERIC:       # %bb.0:
12697; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
12698; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12699; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
12700; GENERIC-NEXT:    retq # sched: [1:1.00]
12701;
12702; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
12703; SKX:       # %bb.0:
12704; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
12705; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12706; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
12707; SKX-NEXT:    retq # sched: [7:1.00]
12708  %vec2 = load <16 x float>, <16 x float>* %vec2p
12709  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12710  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12711  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
12712  ret <16 x float> %res
12713}
12714
; Zero-masked unpack-low of <16 x float>; %vec2 is loaded from %vec2p. The
; icmp marks lanes whose %mask element is zero; those lanes take the
; interleave of elements 0 and 1 of each 4-element group of %vec1 and %vec2,
; the rest are zero. The assertions expect vptestnmd to produce %k1 followed
; by a {%k1} {z} vunpcklps whose second source is a mem operand.
12715define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
12716; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
12717; GENERIC:       # %bb.0:
12718; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
12719; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12720; GENERIC-NEXT:    retq # sched: [1:1.00]
12721;
12722; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
12723; SKX:       # %bb.0:
12724; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
12725; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
12726; SKX-NEXT:    retq # sched: [7:1.00]
12727  %vec2 = load <16 x float>, <16 x float>* %vec2p
12728  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
12729  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
12730  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
12731  ret <16 x float> %res
12732}
12733
; Unmasked unpack-low of two <2 x double> arguments: the shufflevector builds
; a vector from element 0 of %vec1 and element 0 of %vec2. The assertions
; expect this to be emitted as vmovlhps on xmm0 and xmm1 rather than as
; vunpcklpd.
12734define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) {
12735; GENERIC-LABEL: test_2xdouble_unpack_low_mask0:
12736; GENERIC:       # %bb.0:
12737; GENERIC-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
12738; GENERIC-NEXT:    retq # sched: [1:1.00]
12739;
12740; SKX-LABEL: test_2xdouble_unpack_low_mask0:
12741; SKX:       # %bb.0:
12742; SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
12743; SKX-NEXT:    retq # sched: [7:1.00]
12744  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12745  ret <2 x double> %res
12746}
; Merge-masked unpack-low of two <2 x double> arguments. Lanes whose %mask
; element equals zero take the shuffle of element 0 of %vec1 and element 0 of
; %vec2; every other lane keeps the value from %vec3. The assertions expect
; vptestnmq to produce %k1, a {%k1} vunpcklpd writing xmm2, and a vmovapd
; copying xmm2 to xmm0.
12747define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
12748; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0:
12749; GENERIC:       # %bb.0:
12750; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
12751; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
12752; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
12753; GENERIC-NEXT:    retq # sched: [1:1.00]
12754;
12755; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0:
12756; SKX:       # %bb.0:
12757; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
12758; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
12759; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
12760; SKX-NEXT:    retq # sched: [7:1.00]
12761  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12762  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12763  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
12764  ret <2 x double> %res
12765}
12766
; Zero-masked low unpack of <2 x double>: lanes where %mask == 0 take the
; <0, 2> shuffle of %vec1/%vec2, all other lanes are zero. The checks expect
; vptestnmq into %k1 followed by vunpcklpd with {%k1} {z}.
12767define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
12768; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
12769; GENERIC:       # %bb.0:
12770; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
12771; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
12772; GENERIC-NEXT:    retq # sched: [1:1.00]
12773;
12774; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
12775; SKX:       # %bb.0:
12776; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
12777; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
12778; SKX-NEXT:    retq # sched: [7:1.00]
12779  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12780  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12781  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
12782  ret <2 x double> %res
12783}
; Merge-masked low unpack of <2 x double>: lanes where %mask == 0 take the
; <0, 2> shuffle of %vec1/%vec2, the remaining lanes keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_2xdouble_masked_unpack_low_mask0 apart from the name - confirm the
; duplication is intended.
12784define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
12785; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1:
12786; GENERIC:       # %bb.0:
12787; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
12788; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
12789; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
12790; GENERIC-NEXT:    retq # sched: [1:1.00]
12791;
12792; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1:
12793; SKX:       # %bb.0:
12794; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
12795; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
12796; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
12797; SKX-NEXT:    retq # sched: [7:1.00]
12798  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12799  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12800  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
12801  ret <2 x double> %res
12802}
12803
; Zero-masked low unpack of <2 x double>: lanes where %mask == 0 take the
; <0, 2> shuffle of %vec1/%vec2, all other lanes are zero.
; NOTE(review): IR and expected output are identical to
; test_2xdouble_zero_masked_unpack_low_mask0 apart from the name - confirm
; the duplication is intended.
12804define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
12805; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
12806; GENERIC:       # %bb.0:
12807; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
12808; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
12809; GENERIC-NEXT:    retq # sched: [1:1.00]
12810;
12811; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
12812; SKX:       # %bb.0:
12813; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
12814; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
12815; SKX-NEXT:    retq # sched: [7:1.00]
12816  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12817  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12818  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
12819  ret <2 x double> %res
12820}
; Unmasked low unpack of <2 x double> whose second source is loaded from
; %vec2p. The checks expect the load to be folded into vunpcklpd as a memory
; operand (mem[0]).
12821define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
12822; GENERIC-LABEL: test_2xdouble_unpack_low_mem_mask0:
12823; GENERIC:       # %bb.0:
12824; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
12825; GENERIC-NEXT:    retq # sched: [1:1.00]
12826;
12827; SKX-LABEL: test_2xdouble_unpack_low_mem_mask0:
12828; SKX:       # %bb.0:
12829; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
12830; SKX-NEXT:    retq # sched: [7:1.00]
12831  %vec2 = load <2 x double>, <2 x double>* %vec2p
12832  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12833  ret <2 x double> %res
12834}
; Merge-masked low unpack of <2 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 2> shuffle, the remaining lanes
; keep %vec3. The checks expect the load folded into a {%k1}-masked vunpcklpd
; writing xmm1, followed by a vmovapd of xmm1 into xmm0.
12835define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
12836; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
12837; GENERIC:       # %bb.0:
12838; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
12839; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
12840; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
12841; GENERIC-NEXT:    retq # sched: [1:1.00]
12842;
12843; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
12844; SKX:       # %bb.0:
12845; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
12846; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
12847; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
12848; SKX-NEXT:    retq # sched: [7:1.00]
12849  %vec2 = load <2 x double>, <2 x double>* %vec2p
12850  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12851  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12852  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
12853  ret <2 x double> %res
12854}
12855
; Zero-masked low unpack of <2 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 2> shuffle, all other lanes are
; zero. The checks expect the load folded into vunpcklpd with {%k1} {z}.
12856define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
12857; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
12858; GENERIC:       # %bb.0:
12859; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
12860; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
12861; GENERIC-NEXT:    retq # sched: [1:1.00]
12862;
12863; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
12864; SKX:       # %bb.0:
12865; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
12866; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
12867; SKX-NEXT:    retq # sched: [7:1.00]
12868  %vec2 = load <2 x double>, <2 x double>* %vec2p
12869  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12870  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12871  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
12872  ret <2 x double> %res
12873}
12874
; Merge-masked low unpack of <2 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 2> shuffle, the remaining lanes
; keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_2xdouble_masked_unpack_low_mem_mask0 apart from the name - confirm the
; duplication is intended.
12875define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
12876; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
12877; GENERIC:       # %bb.0:
12878; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
12879; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
12880; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
12881; GENERIC-NEXT:    retq # sched: [1:1.00]
12882;
12883; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
12884; SKX:       # %bb.0:
12885; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
12886; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
12887; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
12888; SKX-NEXT:    retq # sched: [7:1.00]
12889  %vec2 = load <2 x double>, <2 x double>* %vec2p
12890  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12891  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12892  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
12893  ret <2 x double> %res
12894}
12895
; Zero-masked low unpack of <2 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 2> shuffle, all other lanes are
; zero.
; NOTE(review): IR and expected output are identical to
; test_2xdouble_zero_masked_unpack_low_mem_mask0 apart from the name -
; confirm the duplication is intended.
12896define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
12897; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
12898; GENERIC:       # %bb.0:
12899; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
12900; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
12901; GENERIC-NEXT:    retq # sched: [1:1.00]
12902;
12903; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
12904; SKX:       # %bb.0:
12905; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
12906; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
12907; SKX-NEXT:    retq # sched: [7:1.00]
12908  %vec2 = load <2 x double>, <2 x double>* %vec2p
12909  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
12910  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
12911  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
12912  ret <2 x double> %res
12913}
12914
; Unmasked low unpack of two <4 x double> registers: shuffle indices
; <0, 4, 2, 6> interleave elements 0 and 2 of %vec1 with elements 0 and 2 of
; %vec2. The GENERIC and SKX checks both expect a single ymm vunpcklpd.
12915define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) {
12916; GENERIC-LABEL: test_4xdouble_unpack_low_mask0:
12917; GENERIC:       # %bb.0:
12918; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12919; GENERIC-NEXT:    retq # sched: [1:1.00]
12920;
12921; SKX-LABEL: test_4xdouble_unpack_low_mask0:
12922; SKX:       # %bb.0:
12923; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12924; SKX-NEXT:    retq # sched: [7:1.00]
12925  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
12926  ret <4 x double> %res
12927}
; Merge-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, the remaining lanes keep %vec3. The
; checks expect vptestnmq to build %k1, a {%k1}-masked vunpcklpd into ymm2,
; and a vmovapd of ymm2 into ymm0 before returning.
12928define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
12929; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0:
12930; GENERIC:       # %bb.0:
12931; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
12932; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12933; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
12934; GENERIC-NEXT:    retq # sched: [1:1.00]
12935;
12936; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0:
12937; SKX:       # %bb.0:
12938; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
12939; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12940; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
12941; SKX-NEXT:    retq # sched: [7:1.00]
12942  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
12943  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
12944  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
12945  ret <4 x double> %res
12946}
12947
; Zero-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, all other lanes are zero. The checks
; expect vptestnmq into %k1 followed by vunpcklpd with {%k1} {z}.
12948define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
12949; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
12950; GENERIC:       # %bb.0:
12951; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
12952; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12953; GENERIC-NEXT:    retq # sched: [1:1.00]
12954;
12955; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
12956; SKX:       # %bb.0:
12957; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
12958; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12959; SKX-NEXT:    retq # sched: [7:1.00]
12960  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
12961  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
12962  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
12963  ret <4 x double> %res
12964}
; Merge-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, the remaining lanes keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_masked_unpack_low_mask0 apart from the name - confirm the
; duplication is intended.
12965define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
12966; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1:
12967; GENERIC:       # %bb.0:
12968; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
12969; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12970; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
12971; GENERIC-NEXT:    retq # sched: [1:1.00]
12972;
12973; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1:
12974; SKX:       # %bb.0:
12975; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
12976; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12977; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
12978; SKX-NEXT:    retq # sched: [7:1.00]
12979  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
12980  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
12981  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
12982  ret <4 x double> %res
12983}
12984
; Zero-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, all other lanes are zero.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_zero_masked_unpack_low_mask0 apart from the name - confirm
; the duplication is intended.
12985define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
12986; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
12987; GENERIC:       # %bb.0:
12988; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
12989; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12990; GENERIC-NEXT:    retq # sched: [1:1.00]
12991;
12992; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
12993; SKX:       # %bb.0:
12994; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
12995; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
12996; SKX-NEXT:    retq # sched: [7:1.00]
12997  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
12998  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
12999  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
13000  ret <4 x double> %res
13001}
; Merge-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, the remaining lanes keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_masked_unpack_low_mask0 apart from the name - confirm the
; duplication is intended.
13002define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
13003; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2:
13004; GENERIC:       # %bb.0:
13005; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
13006; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13007; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
13008; GENERIC-NEXT:    retq # sched: [1:1.00]
13009;
13010; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2:
13011; SKX:       # %bb.0:
13012; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
13013; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13014; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
13015; SKX-NEXT:    retq # sched: [7:1.00]
13016  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13017  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13018  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
13019  ret <4 x double> %res
13020}
13021
; Zero-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, all other lanes are zero.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_zero_masked_unpack_low_mask0 apart from the name - confirm
; the duplication is intended.
13022define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
13023; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
13024; GENERIC:       # %bb.0:
13025; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
13026; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13027; GENERIC-NEXT:    retq # sched: [1:1.00]
13028;
13029; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
13030; SKX:       # %bb.0:
13031; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
13032; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13033; SKX-NEXT:    retq # sched: [7:1.00]
13034  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13035  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13036  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
13037  ret <4 x double> %res
13038}
; Unmasked low unpack of two <4 x double> registers using shuffle indices
; <0, 4, 2, 6>; the checks expect a single ymm vunpcklpd.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_unpack_low_mask0 apart from the name - confirm the
; duplication is intended.
13039define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) {
13040; GENERIC-LABEL: test_4xdouble_unpack_low_mask3:
13041; GENERIC:       # %bb.0:
13042; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13043; GENERIC-NEXT:    retq # sched: [1:1.00]
13044;
13045; SKX-LABEL: test_4xdouble_unpack_low_mask3:
13046; SKX:       # %bb.0:
13047; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13048; SKX-NEXT:    retq # sched: [7:1.00]
13049  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13050  ret <4 x double> %res
13051}
; Merge-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, the remaining lanes keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_masked_unpack_low_mask0 apart from the name - confirm the
; duplication is intended.
13052define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
13053; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3:
13054; GENERIC:       # %bb.0:
13055; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
13056; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13057; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
13058; GENERIC-NEXT:    retq # sched: [1:1.00]
13059;
13060; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3:
13061; SKX:       # %bb.0:
13062; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
13063; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13064; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
13065; SKX-NEXT:    retq # sched: [7:1.00]
13066  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13067  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13068  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
13069  ret <4 x double> %res
13070}
13071
; Zero-masked low unpack of <4 x double>: lanes where %mask == 0 take the
; <0, 4, 2, 6> shuffle of %vec1/%vec2, all other lanes are zero.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_zero_masked_unpack_low_mask0 apart from the name - confirm
; the duplication is intended.
13072define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
13073; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
13074; GENERIC:       # %bb.0:
13075; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
13076; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13077; GENERIC-NEXT:    retq # sched: [1:1.00]
13078;
13079; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
13080; SKX:       # %bb.0:
13081; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
13082; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
13083; SKX-NEXT:    retq # sched: [7:1.00]
13084  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13085  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13086  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
13087  ret <4 x double> %res
13088}
; Unmasked low unpack of <4 x double> whose second source is loaded from
; %vec2p (shuffle indices <0, 4, 2, 6>). The checks expect the load to be
; folded into vunpcklpd as a memory operand (mem[0], mem[2]).
13089define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
13090; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask0:
13091; GENERIC:       # %bb.0:
13092; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13093; GENERIC-NEXT:    retq # sched: [1:1.00]
13094;
13095; SKX-LABEL: test_4xdouble_unpack_low_mem_mask0:
13096; SKX:       # %bb.0:
13097; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13098; SKX-NEXT:    retq # sched: [7:1.00]
13099  %vec2 = load <4 x double>, <4 x double>* %vec2p
13100  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13101  ret <4 x double> %res
13102}
; Merge-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, the remaining
; lanes keep %vec3. The checks expect the load folded into a {%k1}-masked
; vunpcklpd writing ymm1, followed by a vmovapd of ymm1 into ymm0.
13103define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
13104; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
13105; GENERIC:       # %bb.0:
13106; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
13107; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13108; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
13109; GENERIC-NEXT:    retq # sched: [1:1.00]
13110;
13111; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
13112; SKX:       # %bb.0:
13113; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
13114; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13115; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
13116; SKX-NEXT:    retq # sched: [7:1.00]
13117  %vec2 = load <4 x double>, <4 x double>* %vec2p
13118  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13119  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13120  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
13121  ret <4 x double> %res
13122}
13123
; Zero-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, all other
; lanes are zero. The checks expect the load folded into vunpcklpd with
; {%k1} {z}.
13124define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
13125; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
13126; GENERIC:       # %bb.0:
13127; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
13128; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13129; GENERIC-NEXT:    retq # sched: [1:1.00]
13130;
13131; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
13132; SKX:       # %bb.0:
13133; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
13134; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13135; SKX-NEXT:    retq # sched: [7:1.00]
13136  %vec2 = load <4 x double>, <4 x double>* %vec2p
13137  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13138  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13139  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
13140  ret <4 x double> %res
13141}
13142
; Merge-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, the remaining
; lanes keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_masked_unpack_low_mem_mask0 apart from the name - confirm the
; duplication is intended.
13143define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
13144; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
13145; GENERIC:       # %bb.0:
13146; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
13147; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13148; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
13149; GENERIC-NEXT:    retq # sched: [1:1.00]
13150;
13151; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
13152; SKX:       # %bb.0:
13153; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
13154; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13155; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
13156; SKX-NEXT:    retq # sched: [7:1.00]
13157  %vec2 = load <4 x double>, <4 x double>* %vec2p
13158  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13159  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13160  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
13161  ret <4 x double> %res
13162}
13163
; Zero-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, all other
; lanes are zero.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_zero_masked_unpack_low_mem_mask0 apart from the name -
; confirm the duplication is intended.
13164define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
13165; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
13166; GENERIC:       # %bb.0:
13167; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
13168; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13169; GENERIC-NEXT:    retq # sched: [1:1.00]
13170;
13171; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
13172; SKX:       # %bb.0:
13173; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
13174; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13175; SKX-NEXT:    retq # sched: [7:1.00]
13176  %vec2 = load <4 x double>, <4 x double>* %vec2p
13177  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13178  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13179  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
13180  ret <4 x double> %res
13181}
13182
; Merge-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, the remaining
; lanes keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_masked_unpack_low_mem_mask0 apart from the name - confirm the
; duplication is intended.
13183define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
13184; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
13185; GENERIC:       # %bb.0:
13186; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
13187; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13188; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
13189; GENERIC-NEXT:    retq # sched: [1:1.00]
13190;
13191; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
13192; SKX:       # %bb.0:
13193; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
13194; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13195; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
13196; SKX-NEXT:    retq # sched: [7:1.00]
13197  %vec2 = load <4 x double>, <4 x double>* %vec2p
13198  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13199  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13200  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
13201  ret <4 x double> %res
13202}
13203
; Zero-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, all other
; lanes are zero.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_zero_masked_unpack_low_mem_mask0 apart from the name -
; confirm the duplication is intended.
13204define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
13205; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
13206; GENERIC:       # %bb.0:
13207; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
13208; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13209; GENERIC-NEXT:    retq # sched: [1:1.00]
13210;
13211; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
13212; SKX:       # %bb.0:
13213; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
13214; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13215; SKX-NEXT:    retq # sched: [7:1.00]
13216  %vec2 = load <4 x double>, <4 x double>* %vec2p
13217  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13218  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13219  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
13220  ret <4 x double> %res
13221}
13222
; Unmasked low unpack of <4 x double> whose second source is loaded from
; %vec2p (shuffle indices <0, 4, 2, 6>); the checks expect the load folded
; into vunpcklpd.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_unpack_low_mem_mask0 apart from the name - confirm the
; duplication is intended.
13223define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
13224; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask3:
13225; GENERIC:       # %bb.0:
13226; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13227; GENERIC-NEXT:    retq # sched: [1:1.00]
13228;
13229; SKX-LABEL: test_4xdouble_unpack_low_mem_mask3:
13230; SKX:       # %bb.0:
13231; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13232; SKX-NEXT:    retq # sched: [7:1.00]
13233  %vec2 = load <4 x double>, <4 x double>* %vec2p
13234  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13235  ret <4 x double> %res
13236}
; Merge-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, the remaining
; lanes keep %vec3.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_masked_unpack_low_mem_mask0 apart from the name - confirm the
; duplication is intended.
13237define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
13238; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
13239; GENERIC:       # %bb.0:
13240; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
13241; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13242; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
13243; GENERIC-NEXT:    retq # sched: [1:1.00]
13244;
13245; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
13246; SKX:       # %bb.0:
13247; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
13248; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13249; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
13250; SKX-NEXT:    retq # sched: [7:1.00]
13251  %vec2 = load <4 x double>, <4 x double>* %vec2p
13252  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13253  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13254  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
13255  ret <4 x double> %res
13256}
13257
; Zero-masked low unpack of <4 x double> with the second source loaded from
; %vec2p: lanes where %mask == 0 take the <0, 4, 2, 6> shuffle, all other
; lanes are zero.
; NOTE(review): IR and expected output are identical to
; test_4xdouble_zero_masked_unpack_low_mem_mask0 apart from the name -
; confirm the duplication is intended.
13258define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
13259; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
13260; GENERIC:       # %bb.0:
13261; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
13262; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13263; GENERIC-NEXT:    retq # sched: [1:1.00]
13264;
13265; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
13266; SKX:       # %bb.0:
13267; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
13268; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
13269; SKX-NEXT:    retq # sched: [7:1.00]
13270  %vec2 = load <4 x double>, <4 x double>* %vec2p
13271  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
13272  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
13273  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
13274  ret <4 x double> %res
13275}
13276
; Unmasked low unpack of two <8 x double> registers: shuffle indices
; <0, 8, 2, 10, 4, 12, 6, 14> interleave the even-indexed elements of %vec1
; with the even-indexed elements of %vec2. The GENERIC and SKX checks both
; expect a single zmm vunpcklpd.
13277define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
13278; GENERIC-LABEL: test_8xdouble_unpack_low_mask0:
13279; GENERIC:       # %bb.0:
13280; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13281; GENERIC-NEXT:    retq # sched: [1:1.00]
13282;
13283; SKX-LABEL: test_8xdouble_unpack_low_mask0:
13284; SKX:       # %bb.0:
13285; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13286; SKX-NEXT:    retq # sched: [7:1.00]
13287  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13288  ret <8 x double> %res
13289}
; Merge-masked low unpack of <8 x double>: lanes where %mask == 0 take the
; <0, 8, 2, 10, 4, 12, 6, 14> shuffle of %vec1/%vec2, the remaining lanes
; keep %vec3. The checks expect vptestnmq to build %k1, a {%k1}-masked
; vunpcklpd into zmm2, and a vmovapd of zmm2 into zmm0 before returning.
13290define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
13291; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0:
13292; GENERIC:       # %bb.0:
13293; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
13294; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13295; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
13296; GENERIC-NEXT:    retq # sched: [1:1.00]
13297;
13298; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
13299; SKX:       # %bb.0:
13300; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
13301; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13302; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
13303; SKX-NEXT:    retq # sched: [7:1.00]
13304  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13305  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13306  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13307  ret <8 x double> %res
13308}
13309
; Zero-masked low unpack of <8 x double>: lanes where %mask == 0 take the
; <0, 8, 2, 10, 4, 12, 6, 14> shuffle of %vec1/%vec2, all other lanes are
; zero. The checks expect vptestnmq into %k1 followed by vunpcklpd with
; {%k1} {z}.
13310define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
13311; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
13312; GENERIC:       # %bb.0:
13313; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13314; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13315; GENERIC-NEXT:    retq # sched: [1:1.00]
13316;
13317; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
13318; SKX:       # %bb.0:
13319; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13320; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13321; SKX-NEXT:    retq # sched: [7:1.00]
13322  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13323  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13324  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13325  ret <8 x double> %res
13326}
; Merge-masked 512-bit unpack-low; lanes with a zero %mask element take the
; shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_8xdouble_masked_unpack_low_mask0
; exactly; only the function name differs.
13327define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
13328; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1:
13329; GENERIC:       # %bb.0:
13330; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
13331; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13332; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
13333; GENERIC-NEXT:    retq # sched: [1:1.00]
13334;
13335; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
13336; SKX:       # %bb.0:
13337; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
13338; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13339; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
13340; SKX-NEXT:    retq # sched: [7:1.00]
13341  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13342  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13343  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13344  ret <8 x double> %res
13345}
13346
; Zero-masked 512-bit unpack-low; lanes with a zero %mask element take the
; shuffle result, the rest become zero.
; NOTE(review): the IR body matches test_8xdouble_zero_masked_unpack_low_mask0
; exactly; only the function name differs.
13347define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
13348; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
13349; GENERIC:       # %bb.0:
13350; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13351; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13352; GENERIC-NEXT:    retq # sched: [1:1.00]
13353;
13354; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
13355; SKX:       # %bb.0:
13356; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13357; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13358; SKX-NEXT:    retq # sched: [7:1.00]
13359  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13360  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13361  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13362  ret <8 x double> %res
13363}
; Merge-masked 512-bit unpack-low; lanes with a zero %mask element take the
; shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_8xdouble_masked_unpack_low_mask0
; exactly; only the function name differs.
13364define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
13365; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2:
13366; GENERIC:       # %bb.0:
13367; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
13368; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13369; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
13370; GENERIC-NEXT:    retq # sched: [1:1.00]
13371;
13372; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
13373; SKX:       # %bb.0:
13374; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
13375; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13376; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
13377; SKX-NEXT:    retq # sched: [7:1.00]
13378  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13379  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13380  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13381  ret <8 x double> %res
13382}
13383
; Zero-masked 512-bit unpack-low; lanes with a zero %mask element take the
; shuffle result, the rest become zero.
; NOTE(review): the IR body matches test_8xdouble_zero_masked_unpack_low_mask0
; exactly; only the function name differs.
13384define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
13385; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
13386; GENERIC:       # %bb.0:
13387; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13388; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13389; GENERIC-NEXT:    retq # sched: [1:1.00]
13390;
13391; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
13392; SKX:       # %bb.0:
13393; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13394; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13395; SKX-NEXT:    retq # sched: [7:1.00]
13396  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13397  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13398  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13399  ret <8 x double> %res
13400}
; Unmasked 512-bit unpack-low: the <0,8,2,10,4,12,6,14> shuffle interleaves the
; even-indexed elements of %vec1 and %vec2. The expected assembly is a single
; vunpcklpd on zmm registers for both RUN configurations.
13401define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
13402; GENERIC-LABEL: test_8xdouble_unpack_low_mask3:
13403; GENERIC:       # %bb.0:
13404; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13405; GENERIC-NEXT:    retq # sched: [1:1.00]
13406;
13407; SKX-LABEL: test_8xdouble_unpack_low_mask3:
13408; SKX:       # %bb.0:
13409; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13410; SKX-NEXT:    retq # sched: [7:1.00]
13411  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13412  ret <8 x double> %res
13413}
; Merge-masked 512-bit unpack-low; lanes with a zero %mask element take the
; shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_8xdouble_masked_unpack_low_mask0
; exactly; only the function name differs.
13414define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
13415; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3:
13416; GENERIC:       # %bb.0:
13417; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
13418; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13419; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
13420; GENERIC-NEXT:    retq # sched: [1:1.00]
13421;
13422; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
13423; SKX:       # %bb.0:
13424; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
13425; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13426; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
13427; SKX-NEXT:    retq # sched: [7:1.00]
13428  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13429  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13430  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13431  ret <8 x double> %res
13432}
13433
; Zero-masked 512-bit unpack-low; lanes with a zero %mask element take the
; shuffle result, the rest become zero.
; NOTE(review): the IR body matches test_8xdouble_zero_masked_unpack_low_mask0
; exactly; only the function name differs.
13434define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
13435; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
13436; GENERIC:       # %bb.0:
13437; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13438; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13439; GENERIC-NEXT:    retq # sched: [1:1.00]
13440;
13441; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
13442; SKX:       # %bb.0:
13443; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13444; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
13445; SKX-NEXT:    retq # sched: [7:1.00]
13446  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13447  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13448  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13449  ret <8 x double> %res
13450}
; Unmasked 512-bit unpack-low where the second shuffle operand is loaded from
; %vec2p. The expected assembly uses a memory operand ("mem[...]") for that
; source instead of a separate load instruction.
13451define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
13452; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask0:
13453; GENERIC:       # %bb.0:
13454; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13455; GENERIC-NEXT:    retq # sched: [1:1.00]
13456;
13457; SKX-LABEL: test_8xdouble_unpack_low_mem_mask0:
13458; SKX:       # %bb.0:
13459; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13460; SKX-NEXT:    retq # sched: [7:1.00]
13461  %vec2 = load <8 x double>, <8 x double>* %vec2p
13462  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13463  ret <8 x double> %res
13464}
; Merge-masked 512-bit unpack-low with the second operand loaded from %vec2p:
; lanes whose %mask element equals zero take the shuffle result, the rest keep
; %vec3. The expected vunpcklpd is {%k1}-masked and reads %vec2 from memory.
13465define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
13466; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
13467; GENERIC:       # %bb.0:
13468; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13469; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13470; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
13471; GENERIC-NEXT:    retq # sched: [1:1.00]
13472;
13473; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
13474; SKX:       # %bb.0:
13475; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13476; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13477; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
13478; SKX-NEXT:    retq # sched: [7:1.00]
13479  %vec2 = load <8 x double>, <8 x double>* %vec2p
13480  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13481  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13482  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13483  ret <8 x double> %res
13484}
13485
; Zero-masked 512-bit unpack-low with the second operand loaded from %vec2p:
; lanes whose %mask element equals zero take the shuffle result, the rest
; become zero. The expected vunpcklpd is {%k1} {z} and reads %vec2 from memory.
13486define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
13487; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
13488; GENERIC:       # %bb.0:
13489; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
13490; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13491; GENERIC-NEXT:    retq # sched: [1:1.00]
13492;
13493; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
13494; SKX:       # %bb.0:
13495; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
13496; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13497; SKX-NEXT:    retq # sched: [7:1.00]
13498  %vec2 = load <8 x double>, <8 x double>* %vec2p
13499  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13500  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13501  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13502  ret <8 x double> %res
13503}
13504
; Merge-masked 512-bit unpack-low with %vec2 loaded from %vec2p; lanes with a
; zero %mask element take the shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_8xdouble_masked_unpack_low_mem_mask0
; exactly; only the function name differs.
13505define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
13506; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
13507; GENERIC:       # %bb.0:
13508; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13509; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13510; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
13511; GENERIC-NEXT:    retq # sched: [1:1.00]
13512;
13513; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
13514; SKX:       # %bb.0:
13515; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13516; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13517; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
13518; SKX-NEXT:    retq # sched: [7:1.00]
13519  %vec2 = load <8 x double>, <8 x double>* %vec2p
13520  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13521  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13522  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13523  ret <8 x double> %res
13524}
13525
; Zero-masked 512-bit unpack-low with %vec2 loaded from %vec2p; lanes with a
; zero %mask element take the shuffle result, the rest become zero.
; NOTE(review): the IR body matches
; test_8xdouble_zero_masked_unpack_low_mem_mask0 exactly; only the name differs.
13526define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
13527; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
13528; GENERIC:       # %bb.0:
13529; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
13530; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13531; GENERIC-NEXT:    retq # sched: [1:1.00]
13532;
13533; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
13534; SKX:       # %bb.0:
13535; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
13536; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13537; SKX-NEXT:    retq # sched: [7:1.00]
13538  %vec2 = load <8 x double>, <8 x double>* %vec2p
13539  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13540  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13541  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13542  ret <8 x double> %res
13543}
13544
; Merge-masked 512-bit unpack-low with %vec2 loaded from %vec2p; lanes with a
; zero %mask element take the shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_8xdouble_masked_unpack_low_mem_mask0
; exactly; only the function name differs.
13545define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
13546; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
13547; GENERIC:       # %bb.0:
13548; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13549; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13550; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
13551; GENERIC-NEXT:    retq # sched: [1:1.00]
13552;
13553; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
13554; SKX:       # %bb.0:
13555; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13556; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13557; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
13558; SKX-NEXT:    retq # sched: [7:1.00]
13559  %vec2 = load <8 x double>, <8 x double>* %vec2p
13560  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13561  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13562  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13563  ret <8 x double> %res
13564}
13565
; Zero-masked 512-bit unpack-low with %vec2 loaded from %vec2p; lanes with a
; zero %mask element take the shuffle result, the rest become zero.
; NOTE(review): the IR body matches
; test_8xdouble_zero_masked_unpack_low_mem_mask0 exactly; only the name differs.
13566define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
13567; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
13568; GENERIC:       # %bb.0:
13569; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
13570; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13571; GENERIC-NEXT:    retq # sched: [1:1.00]
13572;
13573; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
13574; SKX:       # %bb.0:
13575; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
13576; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13577; SKX-NEXT:    retq # sched: [7:1.00]
13578  %vec2 = load <8 x double>, <8 x double>* %vec2p
13579  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13580  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13581  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13582  ret <8 x double> %res
13583}
13584
; Unmasked 512-bit unpack-low with the second operand loaded from %vec2p; the
; expected vunpcklpd reads that operand from memory.
; NOTE(review): the IR body matches test_8xdouble_unpack_low_mem_mask0 exactly;
; only the function name differs.
13585define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
13586; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask3:
13587; GENERIC:       # %bb.0:
13588; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13589; GENERIC-NEXT:    retq # sched: [1:1.00]
13590;
13591; SKX-LABEL: test_8xdouble_unpack_low_mem_mask3:
13592; SKX:       # %bb.0:
13593; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13594; SKX-NEXT:    retq # sched: [7:1.00]
13595  %vec2 = load <8 x double>, <8 x double>* %vec2p
13596  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13597  ret <8 x double> %res
13598}
; Merge-masked 512-bit unpack-low with %vec2 loaded from %vec2p; lanes with a
; zero %mask element take the shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_8xdouble_masked_unpack_low_mem_mask0
; exactly; only the function name differs.
13599define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
13600; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
13601; GENERIC:       # %bb.0:
13602; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
13603; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13604; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
13605; GENERIC-NEXT:    retq # sched: [1:1.00]
13606;
13607; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
13608; SKX:       # %bb.0:
13609; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
13610; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13611; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
13612; SKX-NEXT:    retq # sched: [7:1.00]
13613  %vec2 = load <8 x double>, <8 x double>* %vec2p
13614  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13615  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13616  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
13617  ret <8 x double> %res
13618}
13619
; Zero-masked 512-bit unpack-low with %vec2 loaded from %vec2p; lanes with a
; zero %mask element take the shuffle result, the rest become zero.
; NOTE(review): the IR body matches
; test_8xdouble_zero_masked_unpack_low_mem_mask0 exactly; only the name differs.
13620define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
13621; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
13622; GENERIC:       # %bb.0:
13623; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
13624; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13625; GENERIC-NEXT:    retq # sched: [1:1.00]
13626;
13627; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
13628; SKX:       # %bb.0:
13629; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
13630; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
13631; SKX-NEXT:    retq # sched: [7:1.00]
13632  %vec2 = load <8 x double>, <8 x double>* %vec2p
13633  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
13634  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
13635  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
13636  ret <8 x double> %res
13637}
13638
; Unmasked 128-bit unpack-high: the <2,6,3,7> shuffle interleaves elements 2
; and 3 of %vec1 and %vec2. The expected assembly is a single vunpckhps on xmm
; registers for both RUN configurations.
13639define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) {
13640; GENERIC-LABEL: test_4xfloat_unpack_high_mask0:
13641; GENERIC:       # %bb.0:
13642; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13643; GENERIC-NEXT:    retq # sched: [1:1.00]
13644;
13645; SKX-LABEL: test_4xfloat_unpack_high_mask0:
13646; SKX:       # %bb.0:
13647; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13648; SKX-NEXT:    retq # sched: [7:1.00]
13649  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13650  ret <4 x float> %res
13651}
; Merge-masked 128-bit unpack-high: lanes whose %mask element equals zero take
; the <2,6,3,7> interleave of %vec1/%vec2; other lanes keep %vec3.
; The expected assembly is a vptestnmd feeding a {%k1}-masked vunpckhps.
13652define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
13653; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0:
13654; GENERIC:       # %bb.0:
13655; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
13656; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13657; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
13658; GENERIC-NEXT:    retq # sched: [1:1.00]
13659;
13660; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0:
13661; SKX:       # %bb.0:
13662; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
13663; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13664; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
13665; SKX-NEXT:    retq # sched: [7:1.00]
13666  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13667  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13668  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13669  ret <4 x float> %res
13670}
13671
; Zero-masked 128-bit unpack-high: lanes whose %mask element equals zero take
; the <2,6,3,7> interleave of %vec1/%vec2; other lanes become zero.
; The expected assembly is a vptestnmd feeding a {%k1} {z} vunpckhps.
13672define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
13673; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
13674; GENERIC:       # %bb.0:
13675; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13676; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13677; GENERIC-NEXT:    retq # sched: [1:1.00]
13678;
13679; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
13680; SKX:       # %bb.0:
13681; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13682; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13683; SKX-NEXT:    retq # sched: [7:1.00]
13684  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13685  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13686  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13687  ret <4 x float> %res
13688}
; Merge-masked 128-bit unpack-high; lanes with a zero %mask element take the
; shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_4xfloat_masked_unpack_high_mask0
; exactly; only the function name differs.
13689define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
13690; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1:
13691; GENERIC:       # %bb.0:
13692; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
13693; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13694; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
13695; GENERIC-NEXT:    retq # sched: [1:1.00]
13696;
13697; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1:
13698; SKX:       # %bb.0:
13699; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
13700; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13701; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
13702; SKX-NEXT:    retq # sched: [7:1.00]
13703  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13704  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13705  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13706  ret <4 x float> %res
13707}
13708
; Zero-masked 128-bit unpack-high; lanes with a zero %mask element take the
; shuffle result, the rest become zero.
; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_high_mask0
; exactly; only the function name differs.
13709define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
13710; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
13711; GENERIC:       # %bb.0:
13712; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13713; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13714; GENERIC-NEXT:    retq # sched: [1:1.00]
13715;
13716; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
13717; SKX:       # %bb.0:
13718; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13719; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13720; SKX-NEXT:    retq # sched: [7:1.00]
13721  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13722  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13723  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13724  ret <4 x float> %res
13725}
; Merge-masked 128-bit unpack-high; lanes with a zero %mask element take the
; shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_4xfloat_masked_unpack_high_mask0
; exactly; only the function name differs.
13726define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
13727; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2:
13728; GENERIC:       # %bb.0:
13729; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
13730; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13731; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
13732; GENERIC-NEXT:    retq # sched: [1:1.00]
13733;
13734; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2:
13735; SKX:       # %bb.0:
13736; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
13737; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13738; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
13739; SKX-NEXT:    retq # sched: [7:1.00]
13740  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13741  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13742  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13743  ret <4 x float> %res
13744}
13745
; Zero-masked 128-bit unpack-high; lanes with a zero %mask element take the
; shuffle result, the rest become zero.
; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_high_mask0
; exactly; only the function name differs.
13746define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
13747; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
13748; GENERIC:       # %bb.0:
13749; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13750; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13751; GENERIC-NEXT:    retq # sched: [1:1.00]
13752;
13753; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
13754; SKX:       # %bb.0:
13755; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13756; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13757; SKX-NEXT:    retq # sched: [7:1.00]
13758  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13759  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13760  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13761  ret <4 x float> %res
13762}
; Unmasked 128-bit unpack-high using the <2,6,3,7> shuffle; the expected
; assembly is a single vunpckhps on xmm registers.
; NOTE(review): the IR body matches test_4xfloat_unpack_high_mask0 exactly;
; only the function name differs.
13763define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) {
13764; GENERIC-LABEL: test_4xfloat_unpack_high_mask3:
13765; GENERIC:       # %bb.0:
13766; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13767; GENERIC-NEXT:    retq # sched: [1:1.00]
13768;
13769; SKX-LABEL: test_4xfloat_unpack_high_mask3:
13770; SKX:       # %bb.0:
13771; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13772; SKX-NEXT:    retq # sched: [7:1.00]
13773  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13774  ret <4 x float> %res
13775}
; Merge-masked 128-bit unpack-high; lanes with a zero %mask element take the
; shuffle result, the rest keep %vec3.
; NOTE(review): the IR body matches test_4xfloat_masked_unpack_high_mask0
; exactly; only the function name differs.
13776define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
13777; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3:
13778; GENERIC:       # %bb.0:
13779; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
13780; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13781; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
13782; GENERIC-NEXT:    retq # sched: [1:1.00]
13783;
13784; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3:
13785; SKX:       # %bb.0:
13786; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
13787; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13788; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
13789; SKX-NEXT:    retq # sched: [7:1.00]
13790  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13791  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13792  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13793  ret <4 x float> %res
13794}
13795
; Zero-masked 128-bit unpack-high; lanes with a zero %mask element take the
; shuffle result, the rest become zero.
; NOTE(review): the IR body matches test_4xfloat_zero_masked_unpack_high_mask0
; exactly; only the function name differs.
13796define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
13797; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
13798; GENERIC:       # %bb.0:
13799; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13800; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13801; GENERIC-NEXT:    retq # sched: [1:1.00]
13802;
13803; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
13804; SKX:       # %bb.0:
13805; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13806; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
13807; SKX-NEXT:    retq # sched: [7:1.00]
13808  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13809  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13810  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13811  ret <4 x float> %res
13812}
; Unmasked 128-bit unpack-high where the second shuffle operand is loaded from
; %vec2p. The expected vunpckhps uses a memory operand ("mem[...]") for that
; source instead of a separate load instruction.
13813define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
13814; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask0:
13815; GENERIC:       # %bb.0:
13816; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13817; GENERIC-NEXT:    retq # sched: [1:1.00]
13818;
13819; SKX-LABEL: test_4xfloat_unpack_high_mem_mask0:
13820; SKX:       # %bb.0:
13821; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13822; SKX-NEXT:    retq # sched: [7:1.00]
13823  %vec2 = load <4 x float>, <4 x float>* %vec2p
13824  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13825  ret <4 x float> %res
13826}
; Merge-masked 128-bit unpack-high with the second operand loaded from %vec2p:
; lanes whose %mask element equals zero take the shuffle result, the rest keep
; %vec3. The expected vunpckhps is {%k1}-masked and reads %vec2 from memory.
13827define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
13828; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
13829; GENERIC:       # %bb.0:
13830; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13831; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13832; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
13833; GENERIC-NEXT:    retq # sched: [1:1.00]
13834;
13835; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
13836; SKX:       # %bb.0:
13837; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13838; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13839; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
13840; SKX-NEXT:    retq # sched: [7:1.00]
13841  %vec2 = load <4 x float>, <4 x float>* %vec2p
13842  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13843  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13844  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13845  ret <4 x float> %res
13846}
13847
; Zero-masked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,6,3,7> shuffle result; every
; other lane is zero (the select's false operand is zeroinitializer).
; The checks expect vptestnmd to produce %k1 followed by a {%k1} {z} vunpckhps
; with a memory operand.
13848define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
13849; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
13850; GENERIC:       # %bb.0:
13851; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
13852; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13853; GENERIC-NEXT:    retq # sched: [1:1.00]
13854;
13855; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
13856; SKX:       # %bb.0:
13857; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
13858; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13859; SKX-NEXT:    retq # sched: [7:1.00]
13860  %vec2 = load <4 x float>, <4 x float>* %vec2p
13861  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13862  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13863  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13864  ret <4 x float> %res
13865}
13866
; Merge-masked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,6,3,7> shuffle result; every
; other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_4xfloat_masked_unpack_high_mem_mask0 exactly; only the name differs.
13867define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
13868; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
13869; GENERIC:       # %bb.0:
13870; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13871; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13872; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
13873; GENERIC-NEXT:    retq # sched: [1:1.00]
13874;
13875; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
13876; SKX:       # %bb.0:
13877; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13878; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13879; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
13880; SKX-NEXT:    retq # sched: [7:1.00]
13881  %vec2 = load <4 x float>, <4 x float>* %vec2p
13882  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13883  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13884  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13885  ret <4 x float> %res
13886}
13887
; Zero-masked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,6,3,7> shuffle result; every
; other lane is zero.
; NOTE(review): body and expected output match
; test_4xfloat_zero_masked_unpack_high_mem_mask0 exactly; only the name differs.
13888define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
13889; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
13890; GENERIC:       # %bb.0:
13891; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
13892; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13893; GENERIC-NEXT:    retq # sched: [1:1.00]
13894;
13895; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
13896; SKX:       # %bb.0:
13897; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
13898; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13899; SKX-NEXT:    retq # sched: [7:1.00]
13900  %vec2 = load <4 x float>, <4 x float>* %vec2p
13901  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13902  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13903  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13904  ret <4 x float> %res
13905}
13906
; Merge-masked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,6,3,7> shuffle result; every
; other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_4xfloat_masked_unpack_high_mem_mask0 exactly; only the name differs.
13907define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
13908; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
13909; GENERIC:       # %bb.0:
13910; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13911; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13912; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
13913; GENERIC-NEXT:    retq # sched: [1:1.00]
13914;
13915; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
13916; SKX:       # %bb.0:
13917; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13918; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13919; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
13920; SKX-NEXT:    retq # sched: [7:1.00]
13921  %vec2 = load <4 x float>, <4 x float>* %vec2p
13922  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13923  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13924  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13925  ret <4 x float> %res
13926}
13927
; Zero-masked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,6,3,7> shuffle result; every
; other lane is zero.
; NOTE(review): body and expected output match
; test_4xfloat_zero_masked_unpack_high_mem_mask0 exactly; only the name differs.
13928define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
13929; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
13930; GENERIC:       # %bb.0:
13931; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
13932; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13933; GENERIC-NEXT:    retq # sched: [1:1.00]
13934;
13935; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
13936; SKX:       # %bb.0:
13937; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
13938; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13939; SKX-NEXT:    retq # sched: [7:1.00]
13940  %vec2 = load <4 x float>, <4 x float>* %vec2p
13941  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13942  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13943  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13944  ret <4 x float> %res
13945}
13946
; Unmasked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Shuffle indices <2,6,3,7> interleave elements 2 and 3 of %vec1 with elements 2
; and 3 of the loaded %vec2.
; NOTE(review): body and expected output match
; test_4xfloat_unpack_high_mem_mask0 exactly; only the name differs.
13947define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
13948; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask3:
13949; GENERIC:       # %bb.0:
13950; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13951; GENERIC-NEXT:    retq # sched: [1:1.00]
13952;
13953; SKX-LABEL: test_4xfloat_unpack_high_mem_mask3:
13954; SKX:       # %bb.0:
13955; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13956; SKX-NEXT:    retq # sched: [7:1.00]
13957  %vec2 = load <4 x float>, <4 x float>* %vec2p
13958  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13959  ret <4 x float> %res
13960}
; Merge-masked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,6,3,7> shuffle result; every
; other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_4xfloat_masked_unpack_high_mem_mask0 exactly; only the name differs.
13961define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
13962; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
13963; GENERIC:       # %bb.0:
13964; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
13965; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13966; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
13967; GENERIC-NEXT:    retq # sched: [1:1.00]
13968;
13969; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
13970; SKX:       # %bb.0:
13971; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
13972; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13973; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
13974; SKX-NEXT:    retq # sched: [7:1.00]
13975  %vec2 = load <4 x float>, <4 x float>* %vec2p
13976  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13977  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13978  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
13979  ret <4 x float> %res
13980}
13981
; Zero-masked high unpack of <4 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,6,3,7> shuffle result; every
; other lane is zero.
; NOTE(review): body and expected output match
; test_4xfloat_zero_masked_unpack_high_mem_mask0 exactly; only the name differs.
13982define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
13983; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
13984; GENERIC:       # %bb.0:
13985; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
13986; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13987; GENERIC-NEXT:    retq # sched: [1:1.00]
13988;
13989; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
13990; SKX:       # %bb.0:
13991; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
13992; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
13993; SKX-NEXT:    retq # sched: [7:1.00]
13994  %vec2 = load <4 x float>, <4 x float>* %vec2p
13995  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
13996  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
13997  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
13998  ret <4 x float> %res
13999}
14000
; Unmasked high unpack of two <8 x float> register operands.
; Shuffle indices <2,10,3,11,6,14,7,15> interleave elements 2,3 and 6,7 of %vec1
; with the same elements of %vec2. Both check prefixes expect a single
; register-register vunpckhps on ymm followed by retq.
14001define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) {
14002; GENERIC-LABEL: test_8xfloat_unpack_high_mask0:
14003; GENERIC:       # %bb.0:
14004; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14005; GENERIC-NEXT:    retq # sched: [1:1.00]
14006;
14007; SKX-LABEL: test_8xfloat_unpack_high_mask0:
14008; SKX:       # %bb.0:
14009; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14010; SKX-NEXT:    retq # sched: [7:1.00]
14011  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14012  ret <8 x float> %res
14013}
; Merge-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3 (see the select).
; The checks expect vptestnmd to produce %k1, a {%k1}-masked vunpckhps writing
; ymm2, and a vmovaps copying ymm2 into ymm0.
14014define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
14015; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
14016; GENERIC:       # %bb.0:
14017; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
14018; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14019; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
14020; GENERIC-NEXT:    retq # sched: [1:1.00]
14021;
14022; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0:
14023; SKX:       # %bb.0:
14024; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
14025; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14026; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
14027; SKX-NEXT:    retq # sched: [7:1.00]
14028  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14029  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14030  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14031  ret <8 x float> %res
14032}
14033
; Zero-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero (the select's false operand is zeroinitializer).
; The checks expect vptestnmd to produce %k1 followed by a {%k1} {z} vunpckhps.
14034define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
14035; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
14036; GENERIC:       # %bb.0:
14037; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14038; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14039; GENERIC-NEXT:    retq # sched: [1:1.00]
14040;
14041; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
14042; SKX:       # %bb.0:
14043; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14044; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14045; SKX-NEXT:    retq # sched: [7:1.00]
14046  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14047  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14048  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14049  ret <8 x float> %res
14050}
; Merge-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_8xfloat_masked_unpack_high_mask0 exactly; only the name differs.
14051define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
14052; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
14053; GENERIC:       # %bb.0:
14054; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
14055; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14056; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
14057; GENERIC-NEXT:    retq # sched: [1:1.00]
14058;
14059; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1:
14060; SKX:       # %bb.0:
14061; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
14062; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14063; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
14064; SKX-NEXT:    retq # sched: [7:1.00]
14065  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14066  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14067  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14068  ret <8 x float> %res
14069}
14070
; Zero-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero.
; NOTE(review): body and expected output match
; test_8xfloat_zero_masked_unpack_high_mask0 exactly; only the name differs.
14071define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
14072; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
14073; GENERIC:       # %bb.0:
14074; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14075; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14076; GENERIC-NEXT:    retq # sched: [1:1.00]
14077;
14078; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
14079; SKX:       # %bb.0:
14080; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14081; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14082; SKX-NEXT:    retq # sched: [7:1.00]
14083  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14084  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14085  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14086  ret <8 x float> %res
14087}
; Merge-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_8xfloat_masked_unpack_high_mask0 exactly; only the name differs.
14088define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
14089; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
14090; GENERIC:       # %bb.0:
14091; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
14092; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14093; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
14094; GENERIC-NEXT:    retq # sched: [1:1.00]
14095;
14096; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2:
14097; SKX:       # %bb.0:
14098; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
14099; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14100; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
14101; SKX-NEXT:    retq # sched: [7:1.00]
14102  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14103  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14104  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14105  ret <8 x float> %res
14106}
14107
; Zero-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero.
; NOTE(review): body and expected output match
; test_8xfloat_zero_masked_unpack_high_mask0 exactly; only the name differs.
14108define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
14109; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
14110; GENERIC:       # %bb.0:
14111; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14112; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14113; GENERIC-NEXT:    retq # sched: [1:1.00]
14114;
14115; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
14116; SKX:       # %bb.0:
14117; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14118; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14119; SKX-NEXT:    retq # sched: [7:1.00]
14120  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14121  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14122  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14123  ret <8 x float> %res
14124}
; Unmasked high unpack of two <8 x float> register operands.
; Shuffle indices <2,10,3,11,6,14,7,15> interleave elements 2,3 and 6,7 of %vec1
; with the same elements of %vec2.
; NOTE(review): body and expected output match
; test_8xfloat_unpack_high_mask0 exactly; only the name differs.
14125define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) {
14126; GENERIC-LABEL: test_8xfloat_unpack_high_mask3:
14127; GENERIC:       # %bb.0:
14128; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14129; GENERIC-NEXT:    retq # sched: [1:1.00]
14130;
14131; SKX-LABEL: test_8xfloat_unpack_high_mask3:
14132; SKX:       # %bb.0:
14133; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14134; SKX-NEXT:    retq # sched: [7:1.00]
14135  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14136  ret <8 x float> %res
14137}
; Merge-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_8xfloat_masked_unpack_high_mask0 exactly; only the name differs.
14138define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
14139; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
14140; GENERIC:       # %bb.0:
14141; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
14142; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14143; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
14144; GENERIC-NEXT:    retq # sched: [1:1.00]
14145;
14146; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3:
14147; SKX:       # %bb.0:
14148; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
14149; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14150; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
14151; SKX-NEXT:    retq # sched: [7:1.00]
14152  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14153  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14154  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14155  ret <8 x float> %res
14156}
14157
; Zero-masked high unpack of two <8 x float> register operands.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero.
; NOTE(review): body and expected output match
; test_8xfloat_zero_masked_unpack_high_mask0 exactly; only the name differs.
14158define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
14159; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
14160; GENERIC:       # %bb.0:
14161; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14162; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14163; GENERIC-NEXT:    retq # sched: [1:1.00]
14164;
14165; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
14166; SKX:       # %bb.0:
14167; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14168; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
14169; SKX-NEXT:    retq # sched: [7:1.00]
14170  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14171  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14172  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14173  ret <8 x float> %res
14174}
; Unmasked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Shuffle indices <2,10,3,11,6,14,7,15> interleave elements 2,3 and 6,7 of %vec1
; with the same elements of the loaded %vec2. Both check prefixes expect a single
; vunpckhps on ymm with a memory operand followed by retq.
14175define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
14176; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask0:
14177; GENERIC:       # %bb.0:
14178; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14179; GENERIC-NEXT:    retq # sched: [1:1.00]
14180;
14181; SKX-LABEL: test_8xfloat_unpack_high_mem_mask0:
14182; SKX:       # %bb.0:
14183; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14184; SKX-NEXT:    retq # sched: [7:1.00]
14185  %vec2 = load <8 x float>, <8 x float>* %vec2p
14186  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14187  ret <8 x float> %res
14188}
; Merge-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3 (see the select).
; The checks expect vptestnmd to produce %k1, a {%k1}-masked vunpckhps with a
; memory operand writing ymm1, and a vmovaps copying ymm1 into ymm0.
14189define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
14190; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
14191; GENERIC:       # %bb.0:
14192; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14193; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14194; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
14195; GENERIC-NEXT:    retq # sched: [1:1.00]
14196;
14197; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
14198; SKX:       # %bb.0:
14199; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14200; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14201; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
14202; SKX-NEXT:    retq # sched: [7:1.00]
14203  %vec2 = load <8 x float>, <8 x float>* %vec2p
14204  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14205  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14206  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14207  ret <8 x float> %res
14208}
14209
; Zero-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero (the select's false operand is zeroinitializer).
; The checks expect vptestnmd to produce %k1 followed by a {%k1} {z} vunpckhps
; with a memory operand.
14210define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
14211; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
14212; GENERIC:       # %bb.0:
14213; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
14214; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14215; GENERIC-NEXT:    retq # sched: [1:1.00]
14216;
14217; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
14218; SKX:       # %bb.0:
14219; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
14220; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14221; SKX-NEXT:    retq # sched: [7:1.00]
14222  %vec2 = load <8 x float>, <8 x float>* %vec2p
14223  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14224  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14225  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14226  ret <8 x float> %res
14227}
14228
; Merge-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_8xfloat_masked_unpack_high_mem_mask0 exactly; only the name differs.
14229define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
14230; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
14231; GENERIC:       # %bb.0:
14232; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14233; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14234; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
14235; GENERIC-NEXT:    retq # sched: [1:1.00]
14236;
14237; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
14238; SKX:       # %bb.0:
14239; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14240; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14241; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
14242; SKX-NEXT:    retq # sched: [7:1.00]
14243  %vec2 = load <8 x float>, <8 x float>* %vec2p
14244  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14245  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14246  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14247  ret <8 x float> %res
14248}
14249
; Zero-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero.
; NOTE(review): body and expected output match
; test_8xfloat_zero_masked_unpack_high_mem_mask0 exactly; only the name differs.
14250define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
14251; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
14252; GENERIC:       # %bb.0:
14253; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
14254; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14255; GENERIC-NEXT:    retq # sched: [1:1.00]
14256;
14257; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
14258; SKX:       # %bb.0:
14259; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
14260; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14261; SKX-NEXT:    retq # sched: [7:1.00]
14262  %vec2 = load <8 x float>, <8 x float>* %vec2p
14263  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14264  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14265  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14266  ret <8 x float> %res
14267}
14268
; Merge-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_8xfloat_masked_unpack_high_mem_mask0 exactly; only the name differs.
14269define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
14270; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
14271; GENERIC:       # %bb.0:
14272; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14273; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14274; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
14275; GENERIC-NEXT:    retq # sched: [1:1.00]
14276;
14277; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
14278; SKX:       # %bb.0:
14279; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14280; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14281; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
14282; SKX-NEXT:    retq # sched: [7:1.00]
14283  %vec2 = load <8 x float>, <8 x float>* %vec2p
14284  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14285  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14286  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14287  ret <8 x float> %res
14288}
14289
; Zero-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero.
; NOTE(review): body and expected output match
; test_8xfloat_zero_masked_unpack_high_mem_mask0 exactly; only the name differs.
14290define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
14291; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
14292; GENERIC:       # %bb.0:
14293; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
14294; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14295; GENERIC-NEXT:    retq # sched: [1:1.00]
14296;
14297; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
14298; SKX:       # %bb.0:
14299; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
14300; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14301; SKX-NEXT:    retq # sched: [7:1.00]
14302  %vec2 = load <8 x float>, <8 x float>* %vec2p
14303  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14304  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14305  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14306  ret <8 x float> %res
14307}
14308
; Unmasked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Shuffle indices <2,10,3,11,6,14,7,15> interleave elements 2,3 and 6,7 of %vec1
; with the same elements of the loaded %vec2.
; NOTE(review): body and expected output match
; test_8xfloat_unpack_high_mem_mask0 exactly; only the name differs.
14309define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
14310; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask3:
14311; GENERIC:       # %bb.0:
14312; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14313; GENERIC-NEXT:    retq # sched: [1:1.00]
14314;
14315; SKX-LABEL: test_8xfloat_unpack_high_mem_mask3:
14316; SKX:       # %bb.0:
14317; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14318; SKX-NEXT:    retq # sched: [7:1.00]
14319  %vec2 = load <8 x float>, <8 x float>* %vec2p
14320  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14321  ret <8 x float> %res
14322}
; Merge-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane keeps the matching element of %vec3.
; NOTE(review): body and expected output match
; test_8xfloat_masked_unpack_high_mem_mask0 exactly; only the name differs.
14323define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
14324; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
14325; GENERIC:       # %bb.0:
14326; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
14327; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14328; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
14329; GENERIC-NEXT:    retq # sched: [1:1.00]
14330;
14331; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
14332; SKX:       # %bb.0:
14333; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
14334; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14335; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
14336; SKX-NEXT:    retq # sched: [7:1.00]
14337  %vec2 = load <8 x float>, <8 x float>* %vec2p
14338  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14339  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14340  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
14341  ret <8 x float> %res
14342}
14343
; Zero-masked high unpack of <8 x float> with the second operand loaded from %vec2p.
; Lanes whose %mask element equals zero take the <2,10,3,11,6,14,7,15> shuffle
; result; every other lane is zero.
; NOTE(review): body and expected output match
; test_8xfloat_zero_masked_unpack_high_mem_mask0 exactly; only the name differs.
14344define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
14345; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
14346; GENERIC:       # %bb.0:
14347; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
14348; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14349; GENERIC-NEXT:    retq # sched: [1:1.00]
14350;
14351; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
14352; SKX:       # %bb.0:
14353; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
14354; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
14355; SKX-NEXT:    retq # sched: [7:1.00]
14356  %vec2 = load <8 x float>, <8 x float>* %vec2p
14357  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
14358  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
14359  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
14360  ret <8 x float> %res
14361}
14362
; Unmasked high unpack of two <16 x float> register operands.
; The shuffle indices interleave elements 2,3, 6,7, 10,11 and 14,15 of %vec1 with
; the same elements of %vec2 (indices 18,19, 22,23, 26,27, 30,31 select from
; %vec2). Both check prefixes expect a single register-register vunpckhps on zmm
; followed by retq.
14363define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
14364; GENERIC-LABEL: test_16xfloat_unpack_high_mask0:
14365; GENERIC:       # %bb.0:
14366; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14367; GENERIC-NEXT:    retq # sched: [1:1.00]
14368;
14369; SKX-LABEL: test_16xfloat_unpack_high_mask0:
14370; SKX:       # %bb.0:
14371; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14372; SKX-NEXT:    retq # sched: [7:1.00]
14373  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14374  ret <16 x float> %res
14375}
; Merge-masked 512-bit unpack-high of two register operands.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes keep the corresponding element of %vec3.
; Expected code under both check prefixes: vptestnmd builds the predicate in %k1,
; vunpckhps writes zmm2 under {%k1}, and vmovaps copies zmm2 into zmm0 before the
; return.
14376define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
14377; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
14378; GENERIC:       # %bb.0:
14379; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
14380; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14381; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
14382; GENERIC-NEXT:    retq # sched: [1:1.00]
14383;
14384; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
14385; SKX:       # %bb.0:
14386; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
14387; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14388; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
14389; SKX-NEXT:    retq # sched: [7:1.00]
14390  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14391  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14392  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14393  ret <16 x float> %res
14394}
14395
; Zero-masked 512-bit unpack-high of two register operands.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes are zero (select against zeroinitializer).
; Expected code under both check prefixes: vptestnmd builds the predicate in %k1,
; then a single vunpckhps with {%k1} {z} writes zmm0 directly, so no trailing
; register copy is expected.
14396define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
14397; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
14398; GENERIC:       # %bb.0:
14399; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14400; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14401; GENERIC-NEXT:    retq # sched: [1:1.00]
14402;
14403; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
14404; SKX:       # %bb.0:
14405; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14406; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14407; SKX-NEXT:    retq # sched: [7:1.00]
14408  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14409  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14410  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14411  ret <16 x float> %res
14412}
; Merge-masked 512-bit unpack-high of two register operands (variant "mask1").
; The IR body is identical to test_16xfloat_masked_unpack_high_mask0; only the
; function name differs.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes keep %vec3. Expected code: vptestnmd into %k1,
; vunpckhps on zmm2 under {%k1}, then vmovaps zmm2 -> zmm0.
14413define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
14414; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
14415; GENERIC:       # %bb.0:
14416; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
14417; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14418; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
14419; GENERIC-NEXT:    retq # sched: [1:1.00]
14420;
14421; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
14422; SKX:       # %bb.0:
14423; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
14424; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14425; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
14426; SKX-NEXT:    retq # sched: [7:1.00]
14427  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14428  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14429  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14430  ret <16 x float> %res
14431}
14432
; Zero-masked 512-bit unpack-high of two register operands (variant "mask1").
; The IR body is identical to test_16xfloat_zero_masked_unpack_high_mask0; only
; the function name differs.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes are zero. Expected code: vptestnmd into %k1, then
; vunpckhps with {%k1} {z} writing zmm0.
14433define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
14434; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
14435; GENERIC:       # %bb.0:
14436; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14437; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14438; GENERIC-NEXT:    retq # sched: [1:1.00]
14439;
14440; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
14441; SKX:       # %bb.0:
14442; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14443; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14444; SKX-NEXT:    retq # sched: [7:1.00]
14445  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14446  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14447  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14448  ret <16 x float> %res
14449}
; Merge-masked 512-bit unpack-high of two register operands (variant "mask2").
; The IR body is identical to test_16xfloat_masked_unpack_high_mask0; only the
; function name differs.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes keep %vec3. Expected code: vptestnmd into %k1,
; vunpckhps on zmm2 under {%k1}, then vmovaps zmm2 -> zmm0.
14450define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
14451; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
14452; GENERIC:       # %bb.0:
14453; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
14454; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14455; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
14456; GENERIC-NEXT:    retq # sched: [1:1.00]
14457;
14458; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
14459; SKX:       # %bb.0:
14460; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
14461; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14462; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
14463; SKX-NEXT:    retq # sched: [7:1.00]
14464  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14465  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14466  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14467  ret <16 x float> %res
14468}
14469
; Zero-masked 512-bit unpack-high of two register operands (variant "mask2").
; The IR body is identical to test_16xfloat_zero_masked_unpack_high_mask0; only
; the function name differs.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes are zero. Expected code: vptestnmd into %k1, then
; vunpckhps with {%k1} {z} writing zmm0.
14470define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
14471; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
14472; GENERIC:       # %bb.0:
14473; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14474; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14475; GENERIC-NEXT:    retq # sched: [1:1.00]
14476;
14477; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
14478; SKX:       # %bb.0:
14479; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14480; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14481; SKX-NEXT:    retq # sched: [7:1.00]
14482  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14483  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14484  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14485  ret <16 x float> %res
14486}
; Unmasked 512-bit unpack-high of two register operands (variant "mask3").
; The IR body is identical to test_16xfloat_unpack_high_mask0; only the function
; name differs.
; The shuffle takes elements 2,3,6,7,10,11,14,15 of %vec1, each followed by the
; same-numbered element of %vec2. Both check prefixes expect one vunpckhps on zmm
; registers followed by the return.
14487define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
14488; GENERIC-LABEL: test_16xfloat_unpack_high_mask3:
14489; GENERIC:       # %bb.0:
14490; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14491; GENERIC-NEXT:    retq # sched: [1:1.00]
14492;
14493; SKX-LABEL: test_16xfloat_unpack_high_mask3:
14494; SKX:       # %bb.0:
14495; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14496; SKX-NEXT:    retq # sched: [7:1.00]
14497  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14498  ret <16 x float> %res
14499}
; Merge-masked 512-bit unpack-high of two register operands (variant "mask3").
; The IR body is identical to test_16xfloat_masked_unpack_high_mask0; only the
; function name differs.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes keep %vec3. Expected code: vptestnmd into %k1,
; vunpckhps on zmm2 under {%k1}, then vmovaps zmm2 -> zmm0.
14500define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
14501; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
14502; GENERIC:       # %bb.0:
14503; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
14504; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14505; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
14506; GENERIC-NEXT:    retq # sched: [1:1.00]
14507;
14508; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
14509; SKX:       # %bb.0:
14510; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
14511; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14512; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
14513; SKX-NEXT:    retq # sched: [7:1.00]
14514  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14515  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14516  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14517  ret <16 x float> %res
14518}
14519
; Zero-masked 512-bit unpack-high of two register operands (variant "mask3").
; The IR body is identical to test_16xfloat_zero_masked_unpack_high_mask0; only
; the function name differs.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; all other lanes are zero. Expected code: vptestnmd into %k1, then
; vunpckhps with {%k1} {z} writing zmm0.
14520define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
14521; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
14522; GENERIC:       # %bb.0:
14523; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14524; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14525; GENERIC-NEXT:    retq # sched: [1:1.00]
14526;
14527; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
14528; SKX:       # %bb.0:
14529; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14530; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
14531; SKX-NEXT:    retq # sched: [7:1.00]
14532  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14533  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14534  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14535  ret <16 x float> %res
14536}
; Unmasked 512-bit unpack-high with the second operand loaded from memory.
; %vec2 is loaded through %vec2p and shuffled with %vec1: elements
; 2,3,6,7,10,11,14,15 of %vec1, each followed by the same-numbered element of the
; loaded vector (indices 16+N address the second shuffle operand).
; Both check prefixes expect one vunpckhps whose second source is shown as
; `mem[...]`; no separate load instruction is expected.
14537define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
14538; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask0:
14539; GENERIC:       # %bb.0:
14540; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14541; GENERIC-NEXT:    retq # sched: [1:1.00]
14542;
14543; SKX-LABEL: test_16xfloat_unpack_high_mem_mask0:
14544; SKX:       # %bb.0:
14545; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14546; SKX-NEXT:    retq # sched: [7:1.00]
14547  %vec2 = load <16 x float>, <16 x float>* %vec2p
14548  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14549  ret <16 x float> %res
14550}
; Merge-masked 512-bit unpack-high with the second operand loaded from memory.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of %vec1
; and the vector loaded through %vec2p; all other lanes keep %vec3.
; Expected code under both check prefixes: vptestnmd builds the predicate in %k1,
; vunpckhps with a `mem[...]` source writes zmm1 under {%k1}, and vmovaps copies
; zmm1 into zmm0. No separate load instruction is expected.
14551define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
14552; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
14553; GENERIC:       # %bb.0:
14554; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14555; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14556; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
14557; GENERIC-NEXT:    retq # sched: [1:1.00]
14558;
14559; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
14560; SKX:       # %bb.0:
14561; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14562; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14563; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
14564; SKX-NEXT:    retq # sched: [7:1.00]
14565  %vec2 = load <16 x float>, <16 x float>* %vec2p
14566  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14567  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14568  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14569  ret <16 x float> %res
14570}
14571
; Zero-masked 512-bit unpack-high with the second operand loaded from memory.
; Lanes whose %mask element equals zero receive the unpack-high shuffle of %vec1
; and the vector loaded through %vec2p; all other lanes are zero.
; Expected code under both check prefixes: vptestnmd builds the predicate in %k1,
; then one vunpckhps with {%k1} {z} and a `mem[...]` source writes zmm0. No
; separate load or register copy is expected.
14572define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
14573; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
14574; GENERIC:       # %bb.0:
14575; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
14576; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14577; GENERIC-NEXT:    retq # sched: [1:1.00]
14578;
14579; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
14580; SKX:       # %bb.0:
14581; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
14582; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14583; SKX-NEXT:    retq # sched: [7:1.00]
14584  %vec2 = load <16 x float>, <16 x float>* %vec2p
14585  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14586  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14587  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14588  ret <16 x float> %res
14589}
14590
; Merge-masked 512-bit unpack-high with a memory second operand (variant "mask1").
; The IR body is identical to test_16xfloat_masked_unpack_high_mem_mask0; only the
; function name differs.
; Lanes whose %mask element equals zero receive the shuffle of %vec1 with the
; vector loaded through %vec2p; all other lanes keep %vec3. Expected code:
; vptestnmd into %k1, vunpckhps (memory source) on zmm1 under {%k1}, then
; vmovaps zmm1 -> zmm0.
14591define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
14592; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
14593; GENERIC:       # %bb.0:
14594; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14595; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14596; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
14597; GENERIC-NEXT:    retq # sched: [1:1.00]
14598;
14599; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
14600; SKX:       # %bb.0:
14601; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14602; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14603; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
14604; SKX-NEXT:    retq # sched: [7:1.00]
14605  %vec2 = load <16 x float>, <16 x float>* %vec2p
14606  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14607  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14608  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14609  ret <16 x float> %res
14610}
14611
; Zero-masked 512-bit unpack-high with a memory second operand (variant "mask1").
; The IR body is identical to test_16xfloat_zero_masked_unpack_high_mem_mask0;
; only the function name differs.
; Lanes whose %mask element equals zero receive the shuffle of %vec1 with the
; vector loaded through %vec2p; all other lanes are zero. Expected code:
; vptestnmd into %k1, then vunpckhps (memory source) with {%k1} {z} writing zmm0.
14612define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
14613; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
14614; GENERIC:       # %bb.0:
14615; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
14616; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14617; GENERIC-NEXT:    retq # sched: [1:1.00]
14618;
14619; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
14620; SKX:       # %bb.0:
14621; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
14622; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14623; SKX-NEXT:    retq # sched: [7:1.00]
14624  %vec2 = load <16 x float>, <16 x float>* %vec2p
14625  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14626  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14627  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14628  ret <16 x float> %res
14629}
14630
; Merge-masked 512-bit unpack-high with a memory second operand (variant "mask2").
; The IR body is identical to test_16xfloat_masked_unpack_high_mem_mask0; only the
; function name differs.
; Lanes whose %mask element equals zero receive the shuffle of %vec1 with the
; vector loaded through %vec2p; all other lanes keep %vec3. Expected code:
; vptestnmd into %k1, vunpckhps (memory source) on zmm1 under {%k1}, then
; vmovaps zmm1 -> zmm0.
14631define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
14632; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
14633; GENERIC:       # %bb.0:
14634; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14635; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14636; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
14637; GENERIC-NEXT:    retq # sched: [1:1.00]
14638;
14639; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
14640; SKX:       # %bb.0:
14641; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14642; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14643; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
14644; SKX-NEXT:    retq # sched: [7:1.00]
14645  %vec2 = load <16 x float>, <16 x float>* %vec2p
14646  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14647  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14648  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14649  ret <16 x float> %res
14650}
14651
; Zero-masked 512-bit unpack-high with a memory second operand (variant "mask2").
; The IR body is identical to test_16xfloat_zero_masked_unpack_high_mem_mask0;
; only the function name differs.
; Lanes whose %mask element equals zero receive the shuffle of %vec1 with the
; vector loaded through %vec2p; all other lanes are zero. Expected code:
; vptestnmd into %k1, then vunpckhps (memory source) with {%k1} {z} writing zmm0.
14652define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
14653; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
14654; GENERIC:       # %bb.0:
14655; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
14656; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14657; GENERIC-NEXT:    retq # sched: [1:1.00]
14658;
14659; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
14660; SKX:       # %bb.0:
14661; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
14662; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14663; SKX-NEXT:    retq # sched: [7:1.00]
14664  %vec2 = load <16 x float>, <16 x float>* %vec2p
14665  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14666  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14667  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14668  ret <16 x float> %res
14669}
14670
; Unmasked 512-bit unpack-high with a memory second operand (variant "mask3").
; The IR body is identical to test_16xfloat_unpack_high_mem_mask0; only the
; function name differs.
; %vec2 is loaded through %vec2p and shuffled with %vec1. Both check prefixes
; expect one vunpckhps whose second source is shown as `mem[...]`, followed by the
; return.
14671define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
14672; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask3:
14673; GENERIC:       # %bb.0:
14674; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14675; GENERIC-NEXT:    retq # sched: [1:1.00]
14676;
14677; SKX-LABEL: test_16xfloat_unpack_high_mem_mask3:
14678; SKX:       # %bb.0:
14679; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14680; SKX-NEXT:    retq # sched: [7:1.00]
14681  %vec2 = load <16 x float>, <16 x float>* %vec2p
14682  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14683  ret <16 x float> %res
14684}
; Merge-masked 512-bit unpack-high with a memory second operand (variant "mask3").
; The IR body is identical to test_16xfloat_masked_unpack_high_mem_mask0; only the
; function name differs.
; Lanes whose %mask element equals zero receive the shuffle of %vec1 with the
; vector loaded through %vec2p; all other lanes keep %vec3. Expected code:
; vptestnmd into %k1, vunpckhps (memory source) on zmm1 under {%k1}, then
; vmovaps zmm1 -> zmm0.
14685define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
14686; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
14687; GENERIC:       # %bb.0:
14688; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
14689; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14690; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
14691; GENERIC-NEXT:    retq # sched: [1:1.00]
14692;
14693; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
14694; SKX:       # %bb.0:
14695; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
14696; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14697; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
14698; SKX-NEXT:    retq # sched: [7:1.00]
14699  %vec2 = load <16 x float>, <16 x float>* %vec2p
14700  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14701  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14702  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
14703  ret <16 x float> %res
14704}
14705
; Zero-masked 512-bit unpack-high with a memory second operand (variant "mask3").
; The IR body is identical to test_16xfloat_zero_masked_unpack_high_mem_mask0;
; only the function name differs.
; Lanes whose %mask element equals zero receive the shuffle of %vec1 with the
; vector loaded through %vec2p; all other lanes are zero. Expected code:
; vptestnmd into %k1, then vunpckhps (memory source) with {%k1} {z} writing zmm0.
14706define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
14707; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
14708; GENERIC:       # %bb.0:
14709; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
14710; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14711; GENERIC-NEXT:    retq # sched: [1:1.00]
14712;
14713; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
14714; SKX:       # %bb.0:
14715; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
14716; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
14717; SKX-NEXT:    retq # sched: [7:1.00]
14718  %vec2 = load <16 x float>, <16 x float>* %vec2p
14719  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
14720  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
14721  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
14722  ret <16 x float> %res
14723}
14724
; Unmasked 128-bit double-precision unpack-high of two register operands.
; The shuffle takes element 1 of %vec1 followed by element 1 of %vec2 (index 3 is
; 2+1, i.e. element 1 of the second shuffle operand).
; Both check prefixes expect a single vunpckhpd on xmm registers followed by the
; return, and pin the `sched:` annotation printed for each instruction.
14725define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) {
14726; GENERIC-LABEL: test_2xdouble_unpack_high_mask0:
14727; GENERIC:       # %bb.0:
14728; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
14729; GENERIC-NEXT:    retq # sched: [1:1.00]
14730;
14731; SKX-LABEL: test_2xdouble_unpack_high_mask0:
14732; SKX:       # %bb.0:
14733; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
14734; SKX-NEXT:    retq # sched: [7:1.00]
14735  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14736  ret <2 x double> %res
14737}
; Merge-masked 128-bit double-precision unpack-high of two register operands.
; Lanes whose 64-bit %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; the other lanes keep the corresponding element of %vec3.
; Expected code under both check prefixes: vptestnmq builds the predicate in %k1,
; vunpckhpd writes xmm2 under {%k1}, and vmovapd copies xmm2 into xmm0 before the
; return.
14738define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
14739; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
14740; GENERIC:       # %bb.0:
14741; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
14742; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
14743; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
14744; GENERIC-NEXT:    retq # sched: [1:1.00]
14745;
14746; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0:
14747; SKX:       # %bb.0:
14748; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
14749; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
14750; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
14751; SKX-NEXT:    retq # sched: [7:1.00]
14752  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14753  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14754  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
14755  ret <2 x double> %res
14756}
14757
; Zero-masked 128-bit double-precision unpack-high of two register operands.
; Lanes whose 64-bit %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; the other lanes are zero (select against zeroinitializer).
; Expected code under both check prefixes: vptestnmq builds the predicate in %k1,
; then a single vunpckhpd with {%k1} {z} writes xmm0, so no trailing register
; copy is expected.
14758define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
14759; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
14760; GENERIC:       # %bb.0:
14761; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
14762; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
14763; GENERIC-NEXT:    retq # sched: [1:1.00]
14764;
14765; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
14766; SKX:       # %bb.0:
14767; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
14768; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
14769; SKX-NEXT:    retq # sched: [7:1.00]
14770  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14771  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14772  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
14773  ret <2 x double> %res
14774}
; Merge-masked 128-bit double-precision unpack-high (variant "mask1").
; The IR body is identical to test_2xdouble_masked_unpack_high_mask0; only the
; function name differs.
; Lanes whose 64-bit %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; the other lanes keep %vec3. Expected code: vptestnmq into %k1,
; vunpckhpd on xmm2 under {%k1}, then vmovapd xmm2 -> xmm0.
14775define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
14776; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
14777; GENERIC:       # %bb.0:
14778; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
14779; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
14780; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
14781; GENERIC-NEXT:    retq # sched: [1:1.00]
14782;
14783; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1:
14784; SKX:       # %bb.0:
14785; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
14786; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
14787; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
14788; SKX-NEXT:    retq # sched: [7:1.00]
14789  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14790  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14791  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
14792  ret <2 x double> %res
14793}
14794
; Zero-masked 128-bit double-precision unpack-high (variant "mask1").
; The IR body is identical to test_2xdouble_zero_masked_unpack_high_mask0; only
; the function name differs.
; Lanes whose 64-bit %mask element equals zero receive the unpack-high shuffle of
; %vec1/%vec2; the other lanes are zero. Expected code: vptestnmq into %k1, then
; vunpckhpd with {%k1} {z} writing xmm0.
14795define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
14796; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
14797; GENERIC:       # %bb.0:
14798; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
14799; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
14800; GENERIC-NEXT:    retq # sched: [1:1.00]
14801;
14802; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
14803; SKX:       # %bb.0:
14804; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
14805; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
14806; SKX-NEXT:    retq # sched: [7:1.00]
14807  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14808  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14809  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
14810  ret <2 x double> %res
14811}
; Unmasked 128-bit double-precision unpack-high with the second operand loaded
; from memory.
; %vec2 is loaded through %vec2p; the shuffle takes element 1 of %vec1 followed by
; element 1 of the loaded vector (index 3 is 2+1).
; Both check prefixes expect one vunpckhpd whose second source is shown as
; `mem[1]`; no separate load instruction is expected.
14812define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
14813; GENERIC-LABEL: test_2xdouble_unpack_high_mem_mask0:
14814; GENERIC:       # %bb.0:
14815; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
14816; GENERIC-NEXT:    retq # sched: [1:1.00]
14817;
14818; SKX-LABEL: test_2xdouble_unpack_high_mem_mask0:
14819; SKX:       # %bb.0:
14820; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
14821; SKX-NEXT:    retq # sched: [7:1.00]
14822  %vec2 = load <2 x double>, <2 x double>* %vec2p
14823  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14824  ret <2 x double> %res
14825}
; Merge-masked high unpack of a <2 x double> register with a vector loaded
; from %vec2p: lanes whose %mask element is zero take the <1, 3> shuffle, the
; remaining lanes keep %vec3. Both RUN configurations expect vptestnmq to form
; %k1, a vunpckhpd under {%k1} into xmm1 with a mem[1] source, and a vmovapd
; of xmm1 into xmm0.
14826define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
14827; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
14828; GENERIC:       # %bb.0:
14829; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
14830; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
14831; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
14832; GENERIC-NEXT:    retq # sched: [1:1.00]
14833;
14834; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
14835; SKX:       # %bb.0:
14836; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
14837; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
14838; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
14839; SKX-NEXT:    retq # sched: [7:1.00]
14840  %vec2 = load <2 x double>, <2 x double>* %vec2p
14841  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14842  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14843  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
14844  ret <2 x double> %res
14845}
14846
; Zero-masked high unpack of a <2 x double> register with a vector loaded from
; %vec2p: lanes whose %mask element is zero take the <1, 3> shuffle, the
; remaining lanes are zeroed. Both RUN configurations expect vptestnmq to form
; %k1 and one vunpckhpd with {%k1} {z} and a mem[1] source.
14847define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
14848; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
14849; GENERIC:       # %bb.0:
14850; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
14851; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
14852; GENERIC-NEXT:    retq # sched: [1:1.00]
14853;
14854; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
14855; SKX:       # %bb.0:
14856; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
14857; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
14858; SKX-NEXT:    retq # sched: [7:1.00]
14859  %vec2 = load <2 x double>, <2 x double>* %vec2p
14860  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14861  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14862  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
14863  ret <2 x double> %res
14864}
14865
; Merge-masked high unpack of a <2 x double> register with a vector loaded
; from %vec2p: lanes whose %mask element is zero take the <1, 3> shuffle, the
; remaining lanes keep %vec3. Both RUN configurations expect vptestnmq to form
; %k1, a vunpckhpd under {%k1} into xmm1 with a mem[1] source, and a vmovapd
; of xmm1 into xmm0.
; The IR body matches test_2xdouble_masked_unpack_high_mem_mask0; only the
; function name differs.
14866define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
14867; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
14868; GENERIC:       # %bb.0:
14869; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
14870; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
14871; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
14872; GENERIC-NEXT:    retq # sched: [1:1.00]
14873;
14874; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
14875; SKX:       # %bb.0:
14876; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
14877; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
14878; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
14879; SKX-NEXT:    retq # sched: [7:1.00]
14880  %vec2 = load <2 x double>, <2 x double>* %vec2p
14881  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14882  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14883  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
14884  ret <2 x double> %res
14885}
14886
; Zero-masked high unpack of a <2 x double> register with a vector loaded from
; %vec2p: lanes whose %mask element is zero take the <1, 3> shuffle, the
; remaining lanes are zeroed. Both RUN configurations expect vptestnmq to form
; %k1 and one vunpckhpd with {%k1} {z} and a mem[1] source.
; The IR body matches test_2xdouble_zero_masked_unpack_high_mem_mask0; only
; the function name differs.
14887define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
14888; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
14889; GENERIC:       # %bb.0:
14890; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
14891; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
14892; GENERIC-NEXT:    retq # sched: [1:1.00]
14893;
14894; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
14895; SKX:       # %bb.0:
14896; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
14897; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
14898; SKX-NEXT:    retq # sched: [7:1.00]
14899  %vec2 = load <2 x double>, <2 x double>* %vec2p
14900  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
14901  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
14902  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
14903  ret <2 x double> %res
14904}
14905
; Unmasked high unpack of two <4 x double> registers: the shuffle mask
; <1, 5, 3, 7> interleaves elements 1 and 3 of %vec1 with elements 1 and 3 of
; %vec2. Both RUN configurations expect a single ymm vunpckhpd before retq.
14906define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) {
14907; GENERIC-LABEL: test_4xdouble_unpack_high_mask0:
14908; GENERIC:       # %bb.0:
14909; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14910; GENERIC-NEXT:    retq # sched: [1:1.00]
14911;
14912; SKX-LABEL: test_4xdouble_unpack_high_mask0:
14913; SKX:       # %bb.0:
14914; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14915; SKX-NEXT:    retq # sched: [7:1.00]
14916  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
14917  ret <4 x double> %res
14918}
; Merge-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes keep %vec3. Both RUN configurations expect vptestnmq to form %k1, a
; vunpckhpd under {%k1} into ymm2, and a vmovapd of ymm2 into ymm0.
14919define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
14920; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
14921; GENERIC:       # %bb.0:
14922; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
14923; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14924; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
14925; GENERIC-NEXT:    retq # sched: [1:1.00]
14926;
14927; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0:
14928; SKX:       # %bb.0:
14929; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
14930; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14931; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
14932; SKX-NEXT:    retq # sched: [7:1.00]
14933  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
14934  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
14935  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
14936  ret <4 x double> %res
14937}
14938
; Zero-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes are zeroed. Both RUN configurations expect vptestnmq to form %k1 and
; one vunpckhpd carrying {%k1} {z}.
14939define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
14940; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
14941; GENERIC:       # %bb.0:
14942; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
14943; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14944; GENERIC-NEXT:    retq # sched: [1:1.00]
14945;
14946; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
14947; SKX:       # %bb.0:
14948; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
14949; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14950; SKX-NEXT:    retq # sched: [7:1.00]
14951  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
14952  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
14953  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
14954  ret <4 x double> %res
14955}
; Merge-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes keep %vec3. Both RUN configurations expect vptestnmq to form %k1, a
; vunpckhpd under {%k1} into ymm2, and a vmovapd of ymm2 into ymm0.
; The IR body matches test_4xdouble_masked_unpack_high_mask0; only the
; function name differs.
14956define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
14957; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
14958; GENERIC:       # %bb.0:
14959; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
14960; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14961; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
14962; GENERIC-NEXT:    retq # sched: [1:1.00]
14963;
14964; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1:
14965; SKX:       # %bb.0:
14966; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
14967; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14968; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
14969; SKX-NEXT:    retq # sched: [7:1.00]
14970  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
14971  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
14972  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
14973  ret <4 x double> %res
14974}
14975
; Zero-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes are zeroed. Both RUN configurations expect vptestnmq to form %k1 and
; one vunpckhpd carrying {%k1} {z}.
; The IR body matches test_4xdouble_zero_masked_unpack_high_mask0; only the
; function name differs.
14976define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
14977; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
14978; GENERIC:       # %bb.0:
14979; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
14980; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14981; GENERIC-NEXT:    retq # sched: [1:1.00]
14982;
14983; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
14984; SKX:       # %bb.0:
14985; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
14986; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14987; SKX-NEXT:    retq # sched: [7:1.00]
14988  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
14989  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
14990  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
14991  ret <4 x double> %res
14992}
; Merge-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes keep %vec3. Both RUN configurations expect vptestnmq to form %k1, a
; vunpckhpd under {%k1} into ymm2, and a vmovapd of ymm2 into ymm0.
; The IR body matches test_4xdouble_masked_unpack_high_mask0; only the
; function name differs.
14993define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
14994; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
14995; GENERIC:       # %bb.0:
14996; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
14997; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
14998; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
14999; GENERIC-NEXT:    retq # sched: [1:1.00]
15000;
15001; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2:
15002; SKX:       # %bb.0:
15003; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
15004; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15005; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
15006; SKX-NEXT:    retq # sched: [7:1.00]
15007  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15008  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15009  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
15010  ret <4 x double> %res
15011}
15012
; Zero-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes are zeroed. Both RUN configurations expect vptestnmq to form %k1 and
; one vunpckhpd carrying {%k1} {z}.
; The IR body matches test_4xdouble_zero_masked_unpack_high_mask0; only the
; function name differs.
15013define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
15014; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
15015; GENERIC:       # %bb.0:
15016; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
15017; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15018; GENERIC-NEXT:    retq # sched: [1:1.00]
15019;
15020; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
15021; SKX:       # %bb.0:
15022; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
15023; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15024; SKX-NEXT:    retq # sched: [7:1.00]
15025  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15026  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15027  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
15028  ret <4 x double> %res
15029}
; Unmasked high unpack of two <4 x double> registers: the shuffle mask
; <1, 5, 3, 7> interleaves elements 1 and 3 of %vec1 with elements 1 and 3 of
; %vec2. Both RUN configurations expect a single ymm vunpckhpd before retq.
; The IR body matches test_4xdouble_unpack_high_mask0; only the function name
; differs.
15030define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) {
15031; GENERIC-LABEL: test_4xdouble_unpack_high_mask3:
15032; GENERIC:       # %bb.0:
15033; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15034; GENERIC-NEXT:    retq # sched: [1:1.00]
15035;
15036; SKX-LABEL: test_4xdouble_unpack_high_mask3:
15037; SKX:       # %bb.0:
15038; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15039; SKX-NEXT:    retq # sched: [7:1.00]
15040  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15041  ret <4 x double> %res
15042}
; Merge-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes keep %vec3. Both RUN configurations expect vptestnmq to form %k1, a
; vunpckhpd under {%k1} into ymm2, and a vmovapd of ymm2 into ymm0.
; The IR body matches test_4xdouble_masked_unpack_high_mask0; only the
; function name differs.
15043define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
15044; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
15045; GENERIC:       # %bb.0:
15046; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
15047; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15048; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
15049; GENERIC-NEXT:    retq # sched: [1:1.00]
15050;
15051; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3:
15052; SKX:       # %bb.0:
15053; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
15054; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15055; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
15056; SKX-NEXT:    retq # sched: [7:1.00]
15057  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15058  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15059  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
15060  ret <4 x double> %res
15061}
15062
; Zero-masked high unpack of two <4 x double> registers: lanes whose %mask
; element is zero take the <1, 5, 3, 7> shuffle of %vec1/%vec2, the remaining
; lanes are zeroed. Both RUN configurations expect vptestnmq to form %k1 and
; one vunpckhpd carrying {%k1} {z}.
; The IR body matches test_4xdouble_zero_masked_unpack_high_mask0; only the
; function name differs.
15063define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
15064; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
15065; GENERIC:       # %bb.0:
15066; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
15067; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15068; GENERIC-NEXT:    retq # sched: [1:1.00]
15069;
15070; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
15071; SKX:       # %bb.0:
15072; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
15073; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
15074; SKX-NEXT:    retq # sched: [7:1.00]
15075  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15076  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15077  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
15078  ret <4 x double> %res
15079}
; Unmasked high unpack of a <4 x double> register with a vector loaded from
; %vec2p: the <1, 5, 3, 7> shuffle interleaves elements 1 and 3 of %vec1 with
; elements 1 and 3 of the loaded value. Both RUN configurations expect one
; vunpckhpd whose second source is printed as mem[...], with no separate load.
15080define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
15081; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask0:
15082; GENERIC:       # %bb.0:
15083; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15084; GENERIC-NEXT:    retq # sched: [1:1.00]
15085;
15086; SKX-LABEL: test_4xdouble_unpack_high_mem_mask0:
15087; SKX:       # %bb.0:
15088; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15089; SKX-NEXT:    retq # sched: [7:1.00]
15090  %vec2 = load <4 x double>, <4 x double>* %vec2p
15091  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15092  ret <4 x double> %res
15093}
; Merge-masked high unpack of a <4 x double> register with a vector loaded
; from %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7>
; shuffle, the remaining lanes keep %vec3. Both RUN configurations expect
; vptestnmq to form %k1, a vunpckhpd under {%k1} into ymm1 with mem[...]
; sources, and a vmovapd of ymm1 into ymm0.
15094define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
15095; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
15096; GENERIC:       # %bb.0:
15097; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
15098; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15099; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
15100; GENERIC-NEXT:    retq # sched: [1:1.00]
15101;
15102; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
15103; SKX:       # %bb.0:
15104; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
15105; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15106; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
15107; SKX-NEXT:    retq # sched: [7:1.00]
15108  %vec2 = load <4 x double>, <4 x double>* %vec2p
15109  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15110  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15111  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
15112  ret <4 x double> %res
15113}
15114
; Zero-masked high unpack of a <4 x double> register with a vector loaded from
; %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7> shuffle, the
; remaining lanes are zeroed. Both RUN configurations expect vptestnmq to form
; %k1 and one vunpckhpd with {%k1} {z} and mem[...] sources.
15115define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
15116; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
15117; GENERIC:       # %bb.0:
15118; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
15119; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15120; GENERIC-NEXT:    retq # sched: [1:1.00]
15121;
15122; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
15123; SKX:       # %bb.0:
15124; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
15125; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15126; SKX-NEXT:    retq # sched: [7:1.00]
15127  %vec2 = load <4 x double>, <4 x double>* %vec2p
15128  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15129  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15130  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
15131  ret <4 x double> %res
15132}
15133
; Merge-masked high unpack of a <4 x double> register with a vector loaded
; from %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7>
; shuffle, the remaining lanes keep %vec3. Both RUN configurations expect
; vptestnmq to form %k1, a vunpckhpd under {%k1} into ymm1 with mem[...]
; sources, and a vmovapd of ymm1 into ymm0.
; The IR body matches test_4xdouble_masked_unpack_high_mem_mask0; only the
; function name differs.
15134define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
15135; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
15136; GENERIC:       # %bb.0:
15137; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
15138; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15139; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
15140; GENERIC-NEXT:    retq # sched: [1:1.00]
15141;
15142; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
15143; SKX:       # %bb.0:
15144; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
15145; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15146; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
15147; SKX-NEXT:    retq # sched: [7:1.00]
15148  %vec2 = load <4 x double>, <4 x double>* %vec2p
15149  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15150  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15151  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
15152  ret <4 x double> %res
15153}
15154
; Zero-masked high unpack of a <4 x double> register with a vector loaded from
; %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7> shuffle, the
; remaining lanes are zeroed. Both RUN configurations expect vptestnmq to form
; %k1 and one vunpckhpd with {%k1} {z} and mem[...] sources.
; The IR body matches test_4xdouble_zero_masked_unpack_high_mem_mask0; only
; the function name differs.
15155define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
15156; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
15157; GENERIC:       # %bb.0:
15158; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
15159; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15160; GENERIC-NEXT:    retq # sched: [1:1.00]
15161;
15162; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
15163; SKX:       # %bb.0:
15164; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
15165; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15166; SKX-NEXT:    retq # sched: [7:1.00]
15167  %vec2 = load <4 x double>, <4 x double>* %vec2p
15168  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15169  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15170  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
15171  ret <4 x double> %res
15172}
15173
; Merge-masked high unpack of a <4 x double> register with a vector loaded
; from %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7>
; shuffle, the remaining lanes keep %vec3. Both RUN configurations expect
; vptestnmq to form %k1, a vunpckhpd under {%k1} into ymm1 with mem[...]
; sources, and a vmovapd of ymm1 into ymm0.
; The IR body matches test_4xdouble_masked_unpack_high_mem_mask0; only the
; function name differs.
15174define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
15175; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
15176; GENERIC:       # %bb.0:
15177; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
15178; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15179; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
15180; GENERIC-NEXT:    retq # sched: [1:1.00]
15181;
15182; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
15183; SKX:       # %bb.0:
15184; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
15185; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15186; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
15187; SKX-NEXT:    retq # sched: [7:1.00]
15188  %vec2 = load <4 x double>, <4 x double>* %vec2p
15189  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15190  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15191  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
15192  ret <4 x double> %res
15193}
15194
; Zero-masked high unpack of a <4 x double> register with a vector loaded from
; %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7> shuffle, the
; remaining lanes are zeroed. Both RUN configurations expect vptestnmq to form
; %k1 and one vunpckhpd with {%k1} {z} and mem[...] sources.
; The IR body matches test_4xdouble_zero_masked_unpack_high_mem_mask0; only
; the function name differs.
15195define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
15196; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
15197; GENERIC:       # %bb.0:
15198; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
15199; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15200; GENERIC-NEXT:    retq # sched: [1:1.00]
15201;
15202; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
15203; SKX:       # %bb.0:
15204; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
15205; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15206; SKX-NEXT:    retq # sched: [7:1.00]
15207  %vec2 = load <4 x double>, <4 x double>* %vec2p
15208  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15209  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15210  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
15211  ret <4 x double> %res
15212}
15213
; Unmasked high unpack of a <4 x double> register with a vector loaded from
; %vec2p: the <1, 5, 3, 7> shuffle interleaves elements 1 and 3 of %vec1 with
; elements 1 and 3 of the loaded value. Both RUN configurations expect one
; vunpckhpd whose second source is printed as mem[...], with no separate load.
; The IR body matches test_4xdouble_unpack_high_mem_mask0; only the function
; name differs.
15214define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
15215; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask3:
15216; GENERIC:       # %bb.0:
15217; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15218; GENERIC-NEXT:    retq # sched: [1:1.00]
15219;
15220; SKX-LABEL: test_4xdouble_unpack_high_mem_mask3:
15221; SKX:       # %bb.0:
15222; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15223; SKX-NEXT:    retq # sched: [7:1.00]
15224  %vec2 = load <4 x double>, <4 x double>* %vec2p
15225  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15226  ret <4 x double> %res
15227}
; Merge-masked high unpack of a <4 x double> register with a vector loaded
; from %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7>
; shuffle, the remaining lanes keep %vec3. Both RUN configurations expect
; vptestnmq to form %k1, a vunpckhpd under {%k1} into ymm1 with mem[...]
; sources, and a vmovapd of ymm1 into ymm0.
; The IR body matches test_4xdouble_masked_unpack_high_mem_mask0; only the
; function name differs.
15228define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
15229; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
15230; GENERIC:       # %bb.0:
15231; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
15232; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15233; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
15234; GENERIC-NEXT:    retq # sched: [1:1.00]
15235;
15236; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
15237; SKX:       # %bb.0:
15238; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
15239; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15240; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
15241; SKX-NEXT:    retq # sched: [7:1.00]
15242  %vec2 = load <4 x double>, <4 x double>* %vec2p
15243  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15244  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15245  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
15246  ret <4 x double> %res
15247}
15248
; Zero-masked high unpack of a <4 x double> register with a vector loaded from
; %vec2p: lanes whose %mask element is zero take the <1, 5, 3, 7> shuffle, the
; remaining lanes are zeroed. Both RUN configurations expect vptestnmq to form
; %k1 and one vunpckhpd with {%k1} {z} and mem[...] sources.
; The IR body matches test_4xdouble_zero_masked_unpack_high_mem_mask0; only
; the function name differs.
15249define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
15250; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
15251; GENERIC:       # %bb.0:
15252; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
15253; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15254; GENERIC-NEXT:    retq # sched: [1:1.00]
15255;
15256; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
15257; SKX:       # %bb.0:
15258; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
15259; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
15260; SKX-NEXT:    retq # sched: [7:1.00]
15261  %vec2 = load <4 x double>, <4 x double>* %vec2p
15262  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
15263  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
15264  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
15265  ret <4 x double> %res
15266}
15267
; Unmasked high unpack of two <8 x double> registers: the shuffle mask
; <1, 9, 3, 11, 5, 13, 7, 15> interleaves the odd-indexed elements of %vec1
; with the odd-indexed elements of %vec2. Both RUN configurations expect a
; single zmm vunpckhpd before retq.
15268define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
15269; GENERIC-LABEL: test_8xdouble_unpack_high_mask0:
15270; GENERIC:       # %bb.0:
15271; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15272; GENERIC-NEXT:    retq # sched: [1:1.00]
15273;
15274; SKX-LABEL: test_8xdouble_unpack_high_mask0:
15275; SKX:       # %bb.0:
15276; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15277; SKX-NEXT:    retq # sched: [7:1.00]
15278  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15279  ret <8 x double> %res
15280}
; Merge-masked high unpack of two <8 x double> registers: lanes whose %mask
; element is zero take the <1, 9, 3, 11, 5, 13, 7, 15> shuffle of %vec1/%vec2,
; the remaining lanes keep %vec3. Both RUN configurations expect vptestnmq to
; form %k1, a vunpckhpd under {%k1} into zmm2, and a vmovapd of zmm2 into zmm0.
15281define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
15282; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
15283; GENERIC:       # %bb.0:
15284; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
15285; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15286; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
15287; GENERIC-NEXT:    retq # sched: [1:1.00]
15288;
15289; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
15290; SKX:       # %bb.0:
15291; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
15292; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15293; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
15294; SKX-NEXT:    retq # sched: [7:1.00]
15295  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15296  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
15297  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15298  ret <8 x double> %res
15299}
15300
; Zero-masked high unpack of two <8 x double> registers: lanes whose %mask
; element is zero take the <1, 9, 3, 11, 5, 13, 7, 15> shuffle of %vec1/%vec2,
; the remaining lanes are zeroed. Both RUN configurations expect vptestnmq to
; form %k1 and one vunpckhpd carrying {%k1} {z}.
15301define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
15302; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
15303; GENERIC:       # %bb.0:
15304; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15305; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15306; GENERIC-NEXT:    retq # sched: [1:1.00]
15307;
15308; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
15309; SKX:       # %bb.0:
15310; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15311; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15312; SKX-NEXT:    retq # sched: [7:1.00]
15313  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15314  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
15315  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15316  ret <8 x double> %res
15317}
; Merge-masked high unpack of two <8 x double> registers: lanes whose %mask
; element is zero take the <1, 9, 3, 11, 5, 13, 7, 15> shuffle of %vec1/%vec2,
; the remaining lanes keep %vec3. Both RUN configurations expect vptestnmq to
; form %k1, a vunpckhpd under {%k1} into zmm2, and a vmovapd of zmm2 into zmm0.
; The IR body matches test_8xdouble_masked_unpack_high_mask0; only the
; function name differs.
15318define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
15319; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
15320; GENERIC:       # %bb.0:
15321; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
15322; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15323; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
15324; GENERIC-NEXT:    retq # sched: [1:1.00]
15325;
15326; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
15327; SKX:       # %bb.0:
15328; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
15329; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15330; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
15331; SKX-NEXT:    retq # sched: [7:1.00]
15332  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15333  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
15334  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15335  ret <8 x double> %res
15336}
15337
; Zero-masked high unpack of two <8 x double> registers: lanes whose %mask
; element is zero take the <1, 9, 3, 11, 5, 13, 7, 15> shuffle of %vec1/%vec2,
; the remaining lanes are zeroed. Both RUN configurations expect vptestnmq to
; form %k1 and one vunpckhpd carrying {%k1} {z}.
; The IR body matches test_8xdouble_zero_masked_unpack_high_mask0; only the
; function name differs.
15338define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
15339; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
15340; GENERIC:       # %bb.0:
15341; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15342; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15343; GENERIC-NEXT:    retq # sched: [1:1.00]
15344;
15345; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
15346; SKX:       # %bb.0:
15347; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15348; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15349; SKX-NEXT:    retq # sched: [7:1.00]
15350  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15351  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
15352  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15353  ret <8 x double> %res
15354}
; Merge-masked high unpack of two <8 x double> registers: lanes whose %mask
; element is zero take the <1, 9, 3, 11, 5, 13, 7, 15> shuffle of %vec1/%vec2,
; the remaining lanes keep %vec3. Both RUN configurations expect vptestnmq to
; form %k1, a vunpckhpd under {%k1} into zmm2, and a vmovapd of zmm2 into zmm0.
; The IR body matches test_8xdouble_masked_unpack_high_mask0; only the
; function name differs.
15355define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
15356; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
15357; GENERIC:       # %bb.0:
15358; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
15359; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15360; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
15361; GENERIC-NEXT:    retq # sched: [1:1.00]
15362;
15363; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
15364; SKX:       # %bb.0:
15365; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
15366; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15367; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
15368; SKX-NEXT:    retq # sched: [7:1.00]
15369  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15370  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
15371  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15372  ret <8 x double> %res
15373}
15374
; Zero-masked unpack-high of two <8 x double> register operands. Lanes whose
; %mask element is zero receive the interleaved odd-indexed elements of %vec1
; and %vec2; all remaining lanes are zeroed. The assertions expect vptestnmq
; plus one vunpckhpd with {%k1} {z}.
15375define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
15376; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
15377; GENERIC:       # %bb.0:
15378; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15379; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15380; GENERIC-NEXT:    retq # sched: [1:1.00]
15381;
15382; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
15383; SKX:       # %bb.0:
15384; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15385; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15386; SKX-NEXT:    retq # sched: [7:1.00]
15387  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15388  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes become zero.
15389  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15390  ret <8 x double> %res
15391}
; Unmasked unpack-high of two <8 x double> register operands. The shuffle mask
; <1,9,3,11,5,13,7,15> interleaves the odd-indexed elements of %vec1 and %vec2,
; and the assertions expect it to lower to a single unmasked vunpckhpd.
15392define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
15393; GENERIC-LABEL: test_8xdouble_unpack_high_mask3:
15394; GENERIC:       # %bb.0:
15395; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15396; GENERIC-NEXT:    retq # sched: [1:1.00]
15397;
15398; SKX-LABEL: test_8xdouble_unpack_high_mask3:
15399; SKX:       # %bb.0:
15400; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15401; SKX-NEXT:    retq # sched: [7:1.00]
  ; Indices 0-7 select from %vec1 and 8-15 from %vec2.
15402  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15403  ret <8 x double> %res
15404}
; Merge-masked unpack-high of two <8 x double> register operands. Lanes whose
; %mask element is zero take the interleaved odd-indexed elements of %vec1 and
; %vec2; the other lanes keep the matching element of %vec3. The assertions
; expect the masked vunpckhpd to target zmm2, followed by a vmovapd into zmm0.
15405define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
15406; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
15407; GENERIC:       # %bb.0:
15408; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
15409; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15410; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
15411; GENERIC-NEXT:    retq # sched: [1:1.00]
15412;
15413; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
15414; SKX:       # %bb.0:
15415; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
15416; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15417; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
15418; SKX-NEXT:    retq # sched: [7:1.00]
15419  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15420  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes pass through %vec3.
15421  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15422  ret <8 x double> %res
15423}
15424
; Zero-masked unpack-high of two <8 x double> register operands. Lanes whose
; %mask element is zero receive the interleaved odd-indexed elements of %vec1
; and %vec2; all remaining lanes are zeroed. The assertions expect vptestnmq
; plus one vunpckhpd with {%k1} {z}.
15425define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
15426; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
15427; GENERIC:       # %bb.0:
15428; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15429; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15430; GENERIC-NEXT:    retq # sched: [1:1.00]
15431;
15432; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
15433; SKX:       # %bb.0:
15434; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15435; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
15436; SKX-NEXT:    retq # sched: [7:1.00]
15437  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15438  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes become zero.
15439  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15440  ret <8 x double> %res
15441}
; Unmasked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. The assertions expect a single vunpckhpd whose second source is the
; memory operand (mem[...]), with no separate load instruction.
15442define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
15443; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask0:
15444; GENERIC:       # %bb.0:
15445; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15446; GENERIC-NEXT:    retq # sched: [1:1.00]
15447;
15448; SKX-LABEL: test_8xdouble_unpack_high_mem_mask0:
15449; SKX:       # %bb.0:
15450; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15451; SKX-NEXT:    retq # sched: [7:1.00]
15452  %vec2 = load <8 x double>, <8 x double>* %vec2p
  ; Indices 0-7 select from %vec1 and 8-15 from the loaded %vec2.
15453  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15454  ret <8 x double> %res
15455}
; Merge-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero take the interleaved odd-indexed
; elements of %vec1 and the loaded vector; other lanes keep %vec3. The
; assertions expect a masked vunpckhpd with a memory source writing zmm1,
; followed by a vmovapd into zmm0.
15456define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
15457; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
15458; GENERIC:       # %bb.0:
15459; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15460; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15461; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
15462; GENERIC-NEXT:    retq # sched: [1:1.00]
15463;
15464; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
15465; SKX:       # %bb.0:
15466; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15467; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15468; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
15469; SKX-NEXT:    retq # sched: [7:1.00]
15470  %vec2 = load <8 x double>, <8 x double>* %vec2p
15471  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15472  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes pass through %vec3.
15473  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15474  ret <8 x double> %res
15475}
15476
; Zero-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero receive the interleaved odd-indexed
; elements of %vec1 and the loaded vector; all other lanes are zeroed. The
; assertions expect vptestnmq plus one memory-source vunpckhpd with {%k1} {z}.
15477define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
15478; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
15479; GENERIC:       # %bb.0:
15480; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
15481; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15482; GENERIC-NEXT:    retq # sched: [1:1.00]
15483;
15484; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
15485; SKX:       # %bb.0:
15486; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
15487; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15488; SKX-NEXT:    retq # sched: [7:1.00]
15489  %vec2 = load <8 x double>, <8 x double>* %vec2p
15490  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15491  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes become zero.
15492  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15493  ret <8 x double> %res
15494}
15495
; Merge-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero take the interleaved odd-indexed
; elements of %vec1 and the loaded vector; other lanes keep %vec3. The
; assertions expect a masked memory-source vunpckhpd writing zmm1, followed by
; a vmovapd into zmm0.
15496define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
15497; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
15498; GENERIC:       # %bb.0:
15499; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15500; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15501; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
15502; GENERIC-NEXT:    retq # sched: [1:1.00]
15503;
15504; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
15505; SKX:       # %bb.0:
15506; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15507; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15508; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
15509; SKX-NEXT:    retq # sched: [7:1.00]
15510  %vec2 = load <8 x double>, <8 x double>* %vec2p
15511  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15512  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes pass through %vec3.
15513  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15514  ret <8 x double> %res
15515}
15516
; Zero-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero receive the interleaved odd-indexed
; elements of %vec1 and the loaded vector; all other lanes are zeroed. The
; assertions expect vptestnmq plus one memory-source vunpckhpd with {%k1} {z}.
15517define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
15518; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
15519; GENERIC:       # %bb.0:
15520; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
15521; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15522; GENERIC-NEXT:    retq # sched: [1:1.00]
15523;
15524; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
15525; SKX:       # %bb.0:
15526; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
15527; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15528; SKX-NEXT:    retq # sched: [7:1.00]
15529  %vec2 = load <8 x double>, <8 x double>* %vec2p
15530  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15531  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes become zero.
15532  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15533  ret <8 x double> %res
15534}
15535
; Merge-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero take the interleaved odd-indexed
; elements of %vec1 and the loaded vector; other lanes keep %vec3. The
; assertions expect a masked memory-source vunpckhpd writing zmm1, followed by
; a vmovapd into zmm0.
15536define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
15537; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
15538; GENERIC:       # %bb.0:
15539; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15540; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15541; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
15542; GENERIC-NEXT:    retq # sched: [1:1.00]
15543;
15544; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
15545; SKX:       # %bb.0:
15546; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15547; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15548; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
15549; SKX-NEXT:    retq # sched: [7:1.00]
15550  %vec2 = load <8 x double>, <8 x double>* %vec2p
15551  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15552  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes pass through %vec3.
15553  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15554  ret <8 x double> %res
15555}
15556
; Zero-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero receive the interleaved odd-indexed
; elements of %vec1 and the loaded vector; all other lanes are zeroed. The
; assertions expect vptestnmq plus one memory-source vunpckhpd with {%k1} {z}.
15557define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
15558; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
15559; GENERIC:       # %bb.0:
15560; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
15561; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15562; GENERIC-NEXT:    retq # sched: [1:1.00]
15563;
15564; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
15565; SKX:       # %bb.0:
15566; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
15567; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15568; SKX-NEXT:    retq # sched: [7:1.00]
15569  %vec2 = load <8 x double>, <8 x double>* %vec2p
15570  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15571  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes become zero.
15572  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15573  ret <8 x double> %res
15574}
15575
; Unmasked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. The assertions expect a single vunpckhpd whose second source is the
; memory operand (mem[...]), with no separate load instruction.
15576define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
15577; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask3:
15578; GENERIC:       # %bb.0:
15579; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15580; GENERIC-NEXT:    retq # sched: [1:1.00]
15581;
15582; SKX-LABEL: test_8xdouble_unpack_high_mem_mask3:
15583; SKX:       # %bb.0:
15584; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15585; SKX-NEXT:    retq # sched: [7:1.00]
15586  %vec2 = load <8 x double>, <8 x double>* %vec2p
  ; Indices 0-7 select from %vec1 and 8-15 from the loaded %vec2.
15587  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
15588  ret <8 x double> %res
15589}
; Merge-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero take the interleaved odd-indexed
; elements of %vec1 and the loaded vector; other lanes keep %vec3. The
; assertions expect a masked memory-source vunpckhpd writing zmm1, followed by
; a vmovapd into zmm0.
15590define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
15591; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
15592; GENERIC:       # %bb.0:
15593; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
15594; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15595; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
15596; GENERIC-NEXT:    retq # sched: [1:1.00]
15597;
15598; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
15599; SKX:       # %bb.0:
15600; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
15601; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15602; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
15603; SKX-NEXT:    retq # sched: [7:1.00]
15604  %vec2 = load <8 x double>, <8 x double>* %vec2p
15605  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15606  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes pass through %vec3.
15607  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
15608  ret <8 x double> %res
15609}
15610
; Zero-masked unpack-high where the second <8 x double> operand is loaded from
; %vec2p. Lanes whose %mask element is zero receive the interleaved odd-indexed
; elements of %vec1 and the loaded vector; all other lanes are zeroed. The
; assertions expect vptestnmq plus one memory-source vunpckhpd with {%k1} {z}.
15611define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
15612; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
15613; GENERIC:       # %bb.0:
15614; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
15615; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15616; GENERIC-NEXT:    retq # sched: [1:1.00]
15617;
15618; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
15619; SKX:       # %bb.0:
15620; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
15621; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
15622; SKX-NEXT:    retq # sched: [7:1.00]
15623  %vec2 = load <8 x double>, <8 x double>* %vec2p
15624  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ; A lane is enabled when its %mask element compares equal to zero.
15625  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
  ; Disabled lanes become zero.
15626  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
15627  ret <8 x double> %res
15628}
15629
15630