1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
4; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
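; These tests exercise x86 lowering of interleaved (stride-4) vector accesses that
; are written as a single wide load/store plus shufflevector (de)interleaving,
; for AVX, AVX2 and AVX-512 (+BW) targets.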
5
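; Factor-4 deinterleaving load of <16 x double>: extract the four stride-4
; <4 x double> lanes and sum them.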
6define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
7; AVX1-LABEL: load_factorf64_4:
8; AVX1:       # %bb.0:
9; AVX1-NEXT:    vmovupd (%rdi), %ymm0
10; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
11; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
12; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
13; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
14; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
15; AVX1-NEXT:    vhaddpd %ymm5, %ymm4, %ymm4
16; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
17; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
18; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
19; AVX1-NEXT:    vaddpd %ymm2, %ymm4, %ymm2
20; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
21; AVX1-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
22; AVX1-NEXT:    retq
23;
24; AVX-LABEL: load_factorf64_4:
25; AVX:       # %bb.0:
26; AVX-NEXT:    vmovupd (%rdi), %ymm0
27; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
28; AVX-NEXT:    vmovupd 64(%rdi), %ymm2
29; AVX-NEXT:    vmovupd 96(%rdi), %ymm3
30; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
31; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
32; AVX-NEXT:    vhaddpd %ymm5, %ymm4, %ymm4
33; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
34; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
35; AVX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
36; AVX-NEXT:    vaddpd %ymm2, %ymm4, %ymm2
37; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
38; AVX-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
39; AVX-NEXT:    retq
40  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
41  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
42  %strided.v1 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
43  %strided.v2 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
44  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
45  %add1 = fadd <4 x double> %strided.v0, %strided.v1
46  %add2 = fadd <4 x double> %add1, %strided.v2
47  %add3 = fadd <4 x double> %add2, %strided.v3
48  ret <4 x double> %add3
49}
50
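; Same factor-4 layout, but only two of the four stride-4 lanes (0 and 3) are used.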
51define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
52; AVX1-LABEL: load_factorf64_2:
53; AVX1:       # %bb.0:
54; AVX1-NEXT:    vmovupd (%rdi), %ymm0
55; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
56; AVX1-NEXT:    vmovupd 64(%rdi), %ymm2
57; AVX1-NEXT:    vmovupd 96(%rdi), %ymm3
58; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
59; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
60; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
61; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
62; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
63; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
64; AVX1-NEXT:    vmulpd %ymm0, %ymm4, %ymm0
65; AVX1-NEXT:    retq
66;
67; AVX-LABEL: load_factorf64_2:
68; AVX:       # %bb.0:
69; AVX-NEXT:    vmovupd (%rdi), %ymm0
70; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
71; AVX-NEXT:    vmovupd 64(%rdi), %ymm2
72; AVX-NEXT:    vmovupd 96(%rdi), %ymm3
73; AVX-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
74; AVX-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
75; AVX-NEXT:    vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
76; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
77; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
78; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
79; AVX-NEXT:    vmulpd %ymm0, %ymm4, %ymm0
80; AVX-NEXT:    retq
81  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
82  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
83  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
84  %mul = fmul <4 x double> %strided.v0, %strided.v3
85  ret <4 x double> %mul
86}
87
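; Both shuffles select the same stride-4 lane (0), so only one deinterleaved
; vector is actually needed.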
88define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
89; AVX1-LABEL: load_factorf64_1:
90; AVX1:       # %bb.0:
91; AVX1-NEXT:    vmovupd (%rdi), %ymm0
92; AVX1-NEXT:    vmovupd 32(%rdi), %ymm1
93; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
94; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
95; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
96; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
97; AVX1-NEXT:    retq
98;
99; AVX-LABEL: load_factorf64_1:
100; AVX:       # %bb.0:
101; AVX-NEXT:    vmovupd (%rdi), %ymm0
102; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
103; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
104; AVX-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
105; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
106; AVX-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
107; AVX-NEXT:    retq
108  %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
109  %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
110  %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
111  %mul = fmul <4 x double> %strided.v0, %strided.v3
112  ret <4 x double> %mul
113}
114
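; Integer (<16 x i64>) variant of the factor-4 deinterleaving load.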
115define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
116; AVX1-LABEL: load_factori64_4:
117; AVX1:       # %bb.0:
118; AVX1-NEXT:    vmovups (%rdi), %ymm0
119; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
120; AVX1-NEXT:    vmovups 64(%rdi), %ymm2
121; AVX1-NEXT:    vmovups 96(%rdi), %ymm3
122; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
123; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
124; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
125; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
126; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
127; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
128; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
129; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
130; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm1
131; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
132; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm4
133; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
134; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
135; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
136; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
137; AVX1-NEXT:    vpaddq %xmm1, %xmm5, %xmm1
138; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
139; AVX1-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
140; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
141; AVX1-NEXT:    retq
142;
143; AVX-LABEL: load_factori64_4:
144; AVX:       # %bb.0:
145; AVX-NEXT:    vmovdqu (%rdi), %ymm0
146; AVX-NEXT:    vmovdqu 32(%rdi), %ymm1
147; AVX-NEXT:    vmovdqu 64(%rdi), %ymm2
148; AVX-NEXT:    vmovdqu 96(%rdi), %ymm3
149; AVX-NEXT:    vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
150; AVX-NEXT:    vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
151; AVX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
152; AVX-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
153; AVX-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
154; AVX-NEXT:    vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
155; AVX-NEXT:    vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
156; AVX-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
157; AVX-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
158; AVX-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
159; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
160; AVX-NEXT:    retq
161  %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
162  %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
163  %strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
164  %strided.v2 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
165  %strided.v3 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
166  %add1 = add <4 x i64> %strided.v0, %strided.v1
167  %add2 = add <4 x i64> %add1, %strided.v2
168  %add3 = add <4 x i64> %add2, %strided.v3
169  ret <4 x i64> %add3
170}
171
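; Factor-4 interleaving store: four <4 x double> vectors are concatenated and
; re-shuffled into interleaved order before a single <16 x double> store.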
172define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
173; AVX1-LABEL: store_factorf64_4:
174; AVX1:       # %bb.0:
175; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
176; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
177; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
178; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
179; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
180; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
181; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
182; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
183; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
184; AVX1-NEXT:    vmovups %ymm3, 64(%rdi)
185; AVX1-NEXT:    vmovups %ymm4, 32(%rdi)
186; AVX1-NEXT:    vmovups %ymm2, (%rdi)
187; AVX1-NEXT:    vzeroupper
188; AVX1-NEXT:    retq
189;
190; AVX2-LABEL: store_factorf64_4:
191; AVX2:       # %bb.0:
192; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
193; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
194; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
195; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
196; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
197; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
198; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
199; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
200; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
201; AVX2-NEXT:    vmovups %ymm3, 64(%rdi)
202; AVX2-NEXT:    vmovups %ymm4, 32(%rdi)
203; AVX2-NEXT:    vmovups %ymm2, (%rdi)
204; AVX2-NEXT:    vzeroupper
205; AVX2-NEXT:    retq
206;
207; AVX512-LABEL: store_factorf64_4:
208; AVX512:       # %bb.0:
209; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
210; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
211; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
212; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
213; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
214; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
215; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
216; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
217; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
218; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
219; AVX512-NEXT:    vmovups %zmm0, 64(%rdi)
220; AVX512-NEXT:    vmovups %zmm1, (%rdi)
221; AVX512-NEXT:    vzeroupper
222; AVX512-NEXT:    retq
223  %s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
224  %s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
225  %interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
226  store <16 x double> %interleaved.vec, <16 x double>* %ptr, align 16
227  ret void
228}
229
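; Integer (<4 x i64>) variant of the factor-4 interleaving store.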
230define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
231; AVX1-LABEL: store_factori64_4:
232; AVX1:       # %bb.0:
233; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
234; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
235; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
236; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
237; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
238; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
239; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
240; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
241; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
242; AVX1-NEXT:    vmovups %ymm3, 64(%rdi)
243; AVX1-NEXT:    vmovups %ymm4, 32(%rdi)
244; AVX1-NEXT:    vmovups %ymm2, (%rdi)
245; AVX1-NEXT:    vzeroupper
246; AVX1-NEXT:    retq
247;
248; AVX2-LABEL: store_factori64_4:
249; AVX2:       # %bb.0:
250; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
251; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
252; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
253; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
254; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
255; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
256; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
257; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
258; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
259; AVX2-NEXT:    vmovups %ymm3, 64(%rdi)
260; AVX2-NEXT:    vmovups %ymm4, 32(%rdi)
261; AVX2-NEXT:    vmovups %ymm2, (%rdi)
262; AVX2-NEXT:    vzeroupper
263; AVX2-NEXT:    retq
264;
265; AVX512-LABEL: store_factori64_4:
266; AVX512:       # %bb.0:
267; AVX512-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm4
268; AVX512-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm5
269; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
270; AVX512-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
271; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
272; AVX512-NEXT:    vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
273; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
274; AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
275; AVX512-NEXT:    vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
276; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
277; AVX512-NEXT:    vmovups %zmm0, 64(%rdi)
278; AVX512-NEXT:    vmovups %zmm1, (%rdi)
279; AVX512-NEXT:    vzeroupper
280; AVX512-NEXT:    retq
281  %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
282  %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
283  %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
284  store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 16
285  ret void
286}
287
288
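; Stride-4 interleaving store of four <32 x i8> vectors into a <128 x i8>.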
289define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, <128 x i8>* %p) {
290; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
291; AVX1:       # %bb.0:
292; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
293; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
294; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
295; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
296; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
297; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
298; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
299; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
300; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm0
301; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
302; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
303; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
304; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
305; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
306; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm3
307; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
308; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
309; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
310; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
311; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
312; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
313; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
314; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
315; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
316; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm4
317; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
318; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
319; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
320; AVX1-NEXT:    vmovaps %ymm0, 96(%rdi)
321; AVX1-NEXT:    vmovaps %ymm1, 64(%rdi)
322; AVX1-NEXT:    vmovaps %ymm2, 32(%rdi)
323; AVX1-NEXT:    vmovaps %ymm4, (%rdi)
324; AVX1-NEXT:    vzeroupper
325; AVX1-NEXT:    retq
326;
327; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
328; AVX2:       # %bb.0:
329; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
330; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
331; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
332; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
333; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
334; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
335; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
336; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
337; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm2
338; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm5
339; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
340; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
341; AVX2-NEXT:    vmovdqa %ymm0, 96(%rdi)
342; AVX2-NEXT:    vmovdqa %ymm1, 64(%rdi)
343; AVX2-NEXT:    vmovdqa %ymm5, 32(%rdi)
344; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
345; AVX2-NEXT:    vzeroupper
346; AVX2-NEXT:    retq
347;
348; AVX512-LABEL: interleaved_store_vf32_i8_stride4:
349; AVX512:       # %bb.0:
350; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
351; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
352; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
353; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
354; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
355; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
356; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
357; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
358; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm2
359; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm5
360; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
361; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
362; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
363; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
364; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%rdi)
365; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
366; AVX512-NEXT:    vzeroupper
367; AVX512-NEXT:    retq
368  %v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
369  %v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
370  %interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
371  store <128 x i8> %interleaved.vec, <128 x i8>* %p
372ret void
373}
374
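; Stride-4 interleaving store of four <16 x i8> vectors into a <64 x i8>.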
375define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) {
376; AVX1-LABEL: interleaved_store_vf16_i8_stride4:
377; AVX1:       # %bb.0:
378; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
379; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
380; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
381; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
382; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
383; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
384; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
385; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
386; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
387; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
388; AVX1-NEXT:    vmovaps %ymm0, 32(%rdi)
389; AVX1-NEXT:    vmovaps %ymm1, (%rdi)
390; AVX1-NEXT:    vzeroupper
391; AVX1-NEXT:    retq
392;
393; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
394; AVX2:       # %bb.0:
395; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
396; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
397; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
398; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
399; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
400; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
401; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
402; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
403; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
404; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
405; AVX2-NEXT:    vmovdqa %ymm0, 32(%rdi)
406; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
407; AVX2-NEXT:    vzeroupper
408; AVX2-NEXT:    retq
409;
410; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
411; AVX512:       # %bb.0:
412; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
413; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
414; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
415; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
416; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
417; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
418; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
419; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
420; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
421; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
422; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
423; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdi)
424; AVX512-NEXT:    vzeroupper
425; AVX512-NEXT:    retq
426%v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
427%v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
428%interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
429store <64 x i8> %interleaved.vec, <64 x i8>* %p
430ret void
431}
432
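; Stride-4 deinterleaving load of <32 x i8> into four <8 x i8> vectors, combined
; with add/mul so all four lanes are used.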
433define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
434; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
435; AVX1:       # %bb.0:
436; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
437; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
438; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
439; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
440; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
441; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
442; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
443; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
444; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
445; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
446; AVX1-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
447; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
448; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
449; AVX1-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
450; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
451; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
452; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
453; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
454; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
455; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
456; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
457; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
458; AVX1-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
459; AVX1-NEXT:    vzeroupper
460; AVX1-NEXT:    retq
461;
462; AVX-LABEL: interleaved_load_vf8_i8_stride4:
463; AVX:       # %bb.0:
464; AVX-NEXT:    vmovdqu (%rdi), %ymm0
465; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
466; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm2
467; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
468; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
469; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
470; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
471; AVX-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
472; AVX-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
473; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
474; AVX-NEXT:    vpaddw %xmm1, %xmm4, %xmm1
475; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
476; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
477; AVX-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
478; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
479; AVX-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
480; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
481; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
482; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
483; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
484; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
485; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
486; AVX-NEXT:    vpmullw %xmm0, %xmm1, %xmm0
487; AVX-NEXT:    vzeroupper
488; AVX-NEXT:    retq
489  %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
490  %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
491  %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
492  %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
493  %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
494
495  %add1 = add <8 x i8> %v1, %v2
496  %add2 = add <8 x i8> %v4, %v3
497  %add3 = mul <8 x i8> %add1, %add2
498  ret <8 x i8> %add3
499}
500
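; Stride-4 deinterleaving load of <64 x i8> into four <16 x i8> vectors; the lanes
; are compared pairwise and the resulting masks compared again, yielding <16 x i1>.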
501define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
502; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
503; AVX1:       # %bb.0:
504; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
505; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
506; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
507; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
508; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
509; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
510; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
511; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
512; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
513; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
514; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
515; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
516; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
517; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
518; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
519; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
520; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
521; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
522; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
523; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
524; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
525; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
526; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm3, %xmm3
527; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
528; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
529; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
530; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
531; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
532; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
533; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
534; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
535; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
536; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
537; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
538; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
539; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
540; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
541; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
542; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
543; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
544; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
545; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm5, %xmm0
546; AVX1-NEXT:    vpxor %xmm0, %xmm3, %xmm0
547; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
548; AVX1-NEXT:    vzeroupper
549; AVX1-NEXT:    retq
550;
551; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
552; AVX2:       # %bb.0:
553; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
554; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
555; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
556; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
557; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
558; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
559; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
560; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
561; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
562; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm6
563; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm4
564; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
565; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
566; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
567; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
568; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
569; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
570; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
571; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
572; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
573; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
574; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
575; AVX2-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
576; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
577; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
578; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
579; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
580; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
581; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
582; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
583; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
584; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
585; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
586; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
587; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
588; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
589; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
590; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
591; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
592; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
593; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
594; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm4, %xmm0
595; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
596; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
597; AVX2-NEXT:    vzeroupper
598; AVX2-NEXT:    retq
599;
600; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
601; AVX512:       # %bb.0:
602; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
603; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
604; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
605; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
606; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
607; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
608; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
609; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
610; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
611; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
612; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
613; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
614; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm3[2,3]
615; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
616; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm6
617; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm5
618; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
619; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
620; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
621; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
622; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
623; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
624; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
625; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm7
626; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
627; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
628; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
629; AVX512-NEXT:    vpshufb %xmm7, %xmm4, %xmm3
630; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
631; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
632; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3]
633; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
634; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
635; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
636; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
637; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
638; AVX512-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
639; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
640; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
641; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
642; AVX512-NEXT:    vpcmpeqb %zmm5, %zmm8, %k0
643; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm3, %k1
644; AVX512-NEXT:    kxnorw %k1, %k0, %k0
645; AVX512-NEXT:    vpmovm2b %k0, %zmm0
646; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
647; AVX512-NEXT:    vzeroupper
648; AVX512-NEXT:    retq
649  %wide.vec = load <64 x i8>, <64 x i8>* %ptr
650  %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
651  %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
652  %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
653  %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
654
655  %cmp1 = icmp eq <16 x i8> %v1, %v2
656  %cmp2 = icmp eq <16 x i8> %v3, %v4
657  %res = icmp eq <16 x i1> %cmp1, %cmp2
658
659  ret <16 x i1> %res
660}
661
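; Stride-4 deinterleaving load of <128 x i8> into four <32 x i8> vectors; the lanes
; are compared pairwise and the resulting masks compared again, yielding <32 x i1>.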
662define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
663; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
664; AVX1:       # %bb.0:
665; AVX1-NEXT:    vmovdqa (%rdi), %ymm11
666; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm14
667; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
668; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm3
669; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
670; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm12
671; AVX1-NEXT:    vpshufb %xmm6, %xmm12, %xmm5
672; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
673; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
674; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
675; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm13
676; AVX1-NEXT:    vpshufb %xmm0, %xmm13, %xmm4
677; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm5
678; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
679; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
680; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm8
681; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm15
682; AVX1-NEXT:    vpshufb %xmm6, %xmm15, %xmm5
683; AVX1-NEXT:    vpshufb %xmm6, %xmm14, %xmm6
684; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
685; AVX1-NEXT:    vextractf128 $1, %ymm11, %xmm6
686; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm4
687; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm0
688; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
689; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
690; AVX1-NEXT:    vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
691; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
692; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
693; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm5
694; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
695; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
696; AVX1-NEXT:    vpshufb %xmm5, %xmm13, %xmm1
697; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm7
698; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
699; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
700; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
701; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
702; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
703; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
704; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm4
705; AVX1-NEXT:    vpshufb %xmm5, %xmm11, %xmm5
706; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
707; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
708; AVX1-NEXT:    vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
709; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
710; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
711; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
712; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
713; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
714; AVX1-NEXT:    vpshufb %xmm4, %xmm13, %xmm5
715; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm7
716; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
717; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
718; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
719; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm5
720; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
721; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
722; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
723; AVX1-NEXT:    vpshufb %xmm4, %xmm11, %xmm4
724; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
725; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
726; AVX1-NEXT:    vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
727; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
728; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm1
729; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
730; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
731; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
732; AVX1-NEXT:    vpshufb %xmm3, %xmm13, %xmm4
733; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
734; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
735; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
736; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
737; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm2
738; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm0
739; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
740; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm2
741; AVX1-NEXT:    vpshufb %xmm3, %xmm11, %xmm3
742; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
743; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
744; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
745; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm1
746; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm2
747; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm1
748; AVX1-NEXT:    vpcmpeqb %xmm9, %xmm8, %xmm2
749; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
750; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
751; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm3
752; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm2
753; AVX1-NEXT:    vpcmpeqb %xmm0, %xmm10, %xmm0
754; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
755; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
756; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
757; AVX1-NEXT:    retq
758;
759; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
760; AVX2:       # %bb.0:
761; AVX2-NEXT:    vmovdqa (%rdi), %ymm11
762; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
763; AVX2-NEXT:    vmovdqa 64(%rdi), %ymm7
764; AVX2-NEXT:    vmovdqa 96(%rdi), %ymm5
765; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm9
766; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
767; AVX2-NEXT:    vpshufb %xmm6, %xmm9, %xmm3
768; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
769; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
770; AVX2-NEXT:    vextracti128 $1, %ymm11, %xmm10
771; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
772; AVX2-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
773; AVX2-NEXT:    vpshufb %xmm2, %xmm11, %xmm0
774; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
775; AVX2-NEXT:    vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm4[2,3]
776; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm12
777; AVX2-NEXT:    vpshufb %xmm6, %xmm12, %xmm3
778; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
779; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm13
780; AVX2-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
781; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
782; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
783; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm6
784; AVX2-NEXT:    vpshufb %xmm2, %xmm6, %xmm0
785; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
786; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm7
787; AVX2-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
788; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
789; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
790; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
791; AVX2-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
792; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
793; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
794; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm0
795; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
796; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
797; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
798; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
799; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
800; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
801; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
802; AVX2-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
803; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
804; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
805; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
806; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
807; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
808; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
809; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
810; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
811; AVX2-NEXT:    vpcmpeqb %ymm0, %ymm8, %ymm8
812; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
813; AVX2-NEXT:    vpshufb %xmm0, %xmm9, %xmm2
814; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm3
815; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
816; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
817; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
818; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
819; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
820; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
821; AVX2-NEXT:    vpshufb %xmm0, %xmm12, %xmm4
822; AVX2-NEXT:    vpshufb %xmm0, %xmm13, %xmm0
823; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
824; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
825; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
826; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
827; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
828; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
829; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
830; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
831; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
832; AVX2-NEXT:    vpshufb %xmm2, %xmm9, %xmm3
833; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
834; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
835; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
836; AVX2-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
837; AVX2-NEXT:    vpshufb %xmm3, %xmm11, %xmm5
838; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
839; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
840; AVX2-NEXT:    vpshufb %xmm2, %xmm12, %xmm4
841; AVX2-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
842; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
843; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
844; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm4
845; AVX2-NEXT:    vpshufb %xmm3, %xmm7, %xmm3
846; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
847; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
848; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
849; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
850; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
851; AVX2-NEXT:    vpxor %ymm0, %ymm8, %ymm0
852; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
853; AVX2-NEXT:    retq
854;
855; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
856; AVX512:       # %bb.0:
857; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
858; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm7
859; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
860; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm10
861; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
862; AVX512-NEXT:    vpshufb %xmm6, %xmm10, %xmm3
863; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm4
864; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
865; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm11
866; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
867; AVX512-NEXT:    vpshufb %xmm2, %xmm11, %xmm5
868; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
869; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
870; AVX512-NEXT:    vpblendd {{.*#+}} xmm8 = xmm3[0,1],xmm4[2,3]
871; AVX512-NEXT:    vextracti64x4 $1, %zmm7, %ymm5
872; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm12
873; AVX512-NEXT:    vpshufb %xmm6, %xmm12, %xmm3
874; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
875; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm13
876; AVX512-NEXT:    vpshufb %xmm6, %xmm13, %xmm6
877; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
878; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
879; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm14
880; AVX512-NEXT:    vpshufb %xmm2, %xmm14, %xmm4
881; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
882; AVX512-NEXT:    vextracti128 $1, %ymm7, %xmm7
883; AVX512-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
884; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
885; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
886; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
887; AVX512-NEXT:    vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
888; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
889; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
890; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
891; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
892; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
893; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
894; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm6
895; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
896; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
897; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm5
898; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
899; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
900; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
901; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
902; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
903; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
904; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
905; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
906; AVX512-NEXT:    vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm2[4,5,6,7]
907; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
908; AVX512-NEXT:    vpshufb %xmm2, %xmm10, %xmm3
909; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm4
910; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
911; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
912; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
913; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm6
914; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
915; AVX512-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
916; AVX512-NEXT:    vpshufb %xmm2, %xmm12, %xmm5
917; AVX512-NEXT:    vpshufb %xmm2, %xmm13, %xmm2
918; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
919; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
920; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm5
921; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
922; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
923; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
924; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
925; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
926; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
927; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm4
928; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
929; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
930; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
931; AVX512-NEXT:    vpshufb %xmm4, %xmm11, %xmm5
932; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
933; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
934; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
935; AVX512-NEXT:    vpshufb %xmm3, %xmm12, %xmm1
936; AVX512-NEXT:    vpshufb %xmm3, %xmm13, %xmm3
937; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
938; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
939; AVX512-NEXT:    vpshufb %xmm4, %xmm14, %xmm3
940; AVX512-NEXT:    vpshufb %xmm4, %xmm7, %xmm4
941; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
942; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
943; AVX512-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
944; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
945; AVX512-NEXT:    vpcmpeqb %zmm9, %zmm8, %k0
946; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm2, %k1
947; AVX512-NEXT:    kxnord %k1, %k0, %k0
948; AVX512-NEXT:    vpmovm2b %k0, %zmm0
949; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
950; AVX512-NEXT:    retq
951  %wide.vec = load <128 x i8>, <128 x i8>* %ptr
952  %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
953
954  %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
955
956  %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
957
958  %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
959
960  %cmp1 = icmp eq <32 x i8> %v1, %v2
961  %cmp2 = icmp eq <32 x i8> %v3, %v4
962  %res = icmp eq <32 x i1> %cmp1, %cmp2
963
964  ret <32 x i1> %res
965}
966
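; Factor-4 interleaved store: the four <8 x i8> arguments are interleaved element-by-element into one <32 x i8> vector and stored.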
967define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
968; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
969; AVX1:       # %bb.0:
970; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
971; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
972; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
973; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
974; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
975; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
976; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
977; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
978; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
979; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
980; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
981; AVX1-NEXT:    vzeroupper
982; AVX1-NEXT:    retq
983;
984; AVX-LABEL: interleaved_store_vf8_i8_stride4:
985; AVX:       # %bb.0:
986; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
987; AVX-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
988; AVX-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
989; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
990; AVX-NEXT:    vpshufb %xmm4, %xmm3, %xmm1
991; AVX-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
992; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
993; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
994; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
995; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
996; AVX-NEXT:    vmovdqa %ymm0, (%rdi)
997; AVX-NEXT:    vzeroupper
998; AVX-NEXT:    retq
999%v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1000%v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1001%interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
1002store <32 x i8> %interleaved.vec, <32 x i8>* %p
1003ret void
1004}
1005
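; Factor-3 interleaved load: a <96 x i8> wide load is deinterleaved into three stride-3 <32 x i8> vectors, which are then added together.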
1006define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
1007; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
1008; AVX1:       # %bb.0:
1009; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
1010; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
1011; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
1012; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
1013; AVX1-NEXT:    vmovdqa 64(%rdi), %xmm4
1014; AVX1-NEXT:    vmovdqa 80(%rdi), %xmm5
1015; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1016; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
1017; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
1018; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
1019; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
1020; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
1021; AVX1-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
1022; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1023; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1024; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1025; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1026; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm8
1027; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
1028; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1029; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm2
1030; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1031; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1032; AVX1-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1033; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
1034; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm5
1035; AVX1-NEXT:    vorps %ymm2, %ymm5, %ymm2
1036; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
1037; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
1038; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
1039; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm6
1040; AVX1-NEXT:    vpor %xmm3, %xmm6, %xmm3
1041; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
1042; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
1043; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
1044; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
1045; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
1046; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
1047; AVX1-NEXT:    vpaddb %xmm9, %xmm2, %xmm2
1048; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
1049; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1050; AVX1-NEXT:    retq
1051;
1052; AVX-LABEL: interleaved_load_vf32_i8_stride3:
1053; AVX:       # %bb.0:
1054; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1055; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
1056; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
1057; AVX-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1058; AVX-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1059; AVX-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1060; AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1061; AVX-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
1062; AVX-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
1063; AVX-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
1064; AVX-NEXT:    vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1065; AVX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1066; AVX-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1067; AVX-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1068; AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1069; AVX-NEXT:    vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
1070; AVX-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
1071; AVX-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
1072; AVX-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1073; AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
1074; AVX-NEXT:    retq
1075  %wide.vec = load <96 x i8>, <96 x i8>* %ptr
1076  %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef, <32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
1077  %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef, <32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
1078  %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef, <32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
1079  %add1 = add <32 x i8> %v1, %v2
1080  %add2 = add <32 x i8> %v3, %add1
1081  ret <32 x i8> %add2
1082}
1083
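; Same factor-3 deinterleaving load, <48 x i8> split into three <16 x i8> vectors.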
1084define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
1085; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
1086; AVX1:       # %bb.0:
1087; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
1088; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
1089; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
1090; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1091; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1092; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1093; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1094; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1095; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1096; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1097; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1098; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1099; AVX1-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
1100; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
1101; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
1102; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
1103; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1104; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
1105; AVX1-NEXT:    retq
1106;
1107; AVX-LABEL: interleaved_load_vf16_i8_stride3:
1108; AVX:       # %bb.0:
1109; AVX-NEXT:    vmovdqa (%rdi), %xmm0
1110; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
1111; AVX-NEXT:    vmovdqa 32(%rdi), %xmm2
1112; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1113; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1114; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1115; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1116; AVX-NEXT:    vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1117; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1118; AVX-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1119; AVX-NEXT:    vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1120; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1121; AVX-NEXT:    vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
1122; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
1123; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
1124; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
1125; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
1126; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
1127; AVX-NEXT:    retq
1128  %wide.vec = load <48 x i8>, <48 x i8>* %ptr
1129  %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef, <16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45>
1130  %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef, <16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
1131  %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef, <16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
1132  %add1 = add <16 x i8> %v1, %v2
1133  %add2 = add <16 x i8> %v3, %add1
1134  ret <16 x i8> %add2
1135}
1136
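; Same factor-3 deinterleaving load, <24 x i8> split into three <8 x i8> vectors.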
1137define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
1138; AVX1-LABEL: interleaved_load_vf8_i8_stride3:
1139; AVX1:       # %bb.0:
1140; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
1141; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1142; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
1143; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
1144; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
1145; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
1146; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
1147; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
1148; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
1149; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
1150; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1151; AVX1-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
1152; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
1153; AVX1-NEXT:    vzeroupper
1154; AVX1-NEXT:    retq
1155;
1156; AVX-LABEL: interleaved_load_vf8_i8_stride3:
1157; AVX:       # %bb.0:
1158; AVX-NEXT:    vmovdqa (%rdi), %ymm0
1159; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
1160; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
1161; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
1162; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
1163; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
1164; AVX-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
1165; AVX-NEXT:    vpor %xmm3, %xmm4, %xmm3
1166; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
1167; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
1168; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1169; AVX-NEXT:    vpaddw %xmm0, %xmm3, %xmm0
1170; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
1171; AVX-NEXT:    vzeroupper
1172; AVX-NEXT:    retq
1173  %wide.vec = load <24 x i8>, <24 x i8>* %ptr
1174  %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
1175  %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
1176  %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
1177  %add1 = add <8 x i8> %v1, %v2
1178  %add2 = add <8 x i8> %v3, %add1
1179  ret <8 x i8> %add2
1180}
1181
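; Factor-3 interleaved store: three <8 x i8> arguments interleaved into a <24 x i8> vector and stored.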
1182define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
1183; AVX1-LABEL: interleaved_store_vf8_i8_stride3:
1184; AVX1:       # %bb.0:
1185; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1186; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1187; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1188; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1189; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
1190; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
1191; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
1192; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
1193; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
1194; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
1195; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1196; AVX1-NEXT:    vmovq %xmm0, 16(%rdi)
1197; AVX1-NEXT:    vmovdqu %xmm2, (%rdi)
1198; AVX1-NEXT:    retq
1199;
1200; AVX-LABEL: interleaved_store_vf8_i8_stride3:
1201; AVX:       # %bb.0:
1202; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1203; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1204; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1205; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1206; AVX-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
1207; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
1208; AVX-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
1209; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
1210; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
1211; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
1212; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1213; AVX-NEXT:    vmovq %xmm0, 16(%rdi)
1214; AVX-NEXT:    vmovdqu %xmm2, (%rdi)
1215; AVX-NEXT:    retq
1216%1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1217%2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1218%interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
1219store <24 x i8> %interleaved.vec, <24 x i8>* %p, align 1
1220ret void
1221}
1222
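; Factor-3 interleaved store of three <16 x i8> arguments into a <48 x i8> vector.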
1223define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
1224; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
1225; AVX1:       # %bb.0:
1226; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1227; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1228; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1229; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1230; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1231; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
1232; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
1233; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
1234; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
1235; AVX1-NEXT:    vpor %xmm4, %xmm6, %xmm4
1236; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
1237; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
1238; AVX1-NEXT:    vpor %xmm6, %xmm0, %xmm0
1239; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1240; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
1241; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
1242; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
1243; AVX1-NEXT:    vmovdqu %xmm1, 32(%rdi)
1244; AVX1-NEXT:    vmovups %ymm0, (%rdi)
1245; AVX1-NEXT:    vzeroupper
1246; AVX1-NEXT:    retq
1247;
1248; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
1249; AVX2:       # %bb.0:
1250; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1251; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1252; AVX2-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1253; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1254; AVX2-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1255; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
1256; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
1257; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
1258; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
1259; AVX2-NEXT:    vpor %xmm4, %xmm6, %xmm4
1260; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
1261; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
1262; AVX2-NEXT:    vpor %xmm6, %xmm0, %xmm0
1263; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1264; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
1265; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
1266; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
1267; AVX2-NEXT:    vmovdqu %xmm1, 32(%rdi)
1268; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
1269; AVX2-NEXT:    vzeroupper
1270; AVX2-NEXT:    retq
1271;
1272; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
1273; AVX512:       # %bb.0:
1274; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1275; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1276; AVX512-NEXT:    vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1277; AVX512-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1278; AVX512-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1279; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
1280; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm4
1281; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
1282; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
1283; AVX512-NEXT:    vpor %xmm4, %xmm6, %xmm4
1284; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm6
1285; AVX512-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
1286; AVX512-NEXT:    vpor %xmm6, %xmm0, %xmm0
1287; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
1288; AVX512-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
1289; AVX512-NEXT:    vpor %xmm2, %xmm1, %xmm1
1290; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0
1291; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm1
1292; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
1293; AVX512-NEXT:    vextracti32x4 $2, %zmm1, 32(%rdi)
1294; AVX512-NEXT:    vzeroupper
1295; AVX512-NEXT:    retq
1296%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1297%2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1298%interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
1299store <48 x i8> %interleaved.vec, <48 x i8>* %p, align 1
1300ret void
1301}
1302
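; Factor-3 interleaved store of three <32 x i8> arguments into a <96 x i8> vector.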
1303define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 x i8>* %p) {
1304; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
1305; AVX1:       # %bb.0:
1306; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1307; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1308; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1309; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1310; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1311; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1312; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1313; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
1314; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1315; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1316; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1317; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1318; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1319; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1320; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1321; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1322; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1323; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1324; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1325; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1326; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
1327; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
1328; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm6, %ymm0
1329; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
1330; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
1331; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
1332; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
1333; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
1334; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1335; AVX1-NEXT:    vmovups %ymm2, 64(%rdi)
1336; AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
1337; AVX1-NEXT:    vmovups %ymm0, (%rdi)
1338; AVX1-NEXT:    vzeroupper
1339; AVX1-NEXT:    retq
1340;
1341; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
1342; AVX2:       # %bb.0:
1343; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1344; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1345; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1346; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1347; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1348; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1349; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1350; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1351; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm3
1352; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1353; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
1354; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1355; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
1356; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1357; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
1358; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
1359; AVX2-NEXT:    vmovdqu %ymm2, 32(%rdi)
1360; AVX2-NEXT:    vmovdqu %ymm3, (%rdi)
1361; AVX2-NEXT:    vzeroupper
1362; AVX2-NEXT:    retq
1363;
1364; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
1365; AVX512:       # %bb.0:
1366; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1367; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1368; AVX512-NEXT:    vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1369; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1370; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1371; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1372; AVX512-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1373; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1374; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm3
1375; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1376; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
1377; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1378; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
1379; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1380; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
1381; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm1
1382; AVX512-NEXT:    vmovdqu %ymm0, 64(%rdi)
1383; AVX512-NEXT:    vmovdqu64 %zmm1, (%rdi)
1384; AVX512-NEXT:    vzeroupper
1385; AVX512-NEXT:    retq
1386%1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
1387%2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1388%interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
1389store <96 x i8> %interleaved.vec, <96 x i8>* %p, align 1
1390ret void
1391}
1392
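; Factor-3 interleaved store of three <64 x i8> arguments into a <192 x i8> vector.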
1393define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <192 x i8>* %p) {
1394; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
1395; AVX1:       # %bb.0:
1396; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
1397; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1398; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1399; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
1400; AVX1-NEXT:    vpalignr {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1401; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
1402; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
1403; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm6[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1404; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1405; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
1406; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1407; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
1408; AVX1-NEXT:    vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1409; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm6
1410; AVX1-NEXT:    vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
1411; AVX1-NEXT:    vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1412; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm7
1413; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
1414; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1415; AVX1-NEXT:    vpalignr {{.*#+}} xmm14 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
1416; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1417; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
1418; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1419; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1420; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
1421; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
1422; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
1423; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
1424; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
1425; AVX1-NEXT:    vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
1426; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
1427; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
1428; AVX1-NEXT:    vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
1429; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
1430; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1431; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
1432; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
1433; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
1434; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1435; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
1436; AVX1-NEXT:    vpshufb %xmm5, %xmm7, %xmm7
1437; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm7, %ymm0
1438; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm6
1439; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
1440; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
1441; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
1442; AVX1-NEXT:    vpshufb %xmm5, %xmm14, %xmm6
1443; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
1444; AVX1-NEXT:    vpshufb %xmm5, %xmm9, %xmm6
1445; AVX1-NEXT:    vpshufb %xmm5, %xmm15, %xmm7
1446; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm6
1447; AVX1-NEXT:    vpshufb %xmm5, %xmm11, %xmm7
1448; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
1449; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm4
1450; AVX1-NEXT:    vpshufb %xmm5, %xmm3, %xmm3
1451; AVX1-NEXT:    vpshufb %xmm5, %xmm8, %xmm5
1452; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
1453; AVX1-NEXT:    vmovups %ymm3, 160(%rdi)
1454; AVX1-NEXT:    vmovups %ymm4, 128(%rdi)
1455; AVX1-NEXT:    vmovups %ymm6, 96(%rdi)
1456; AVX1-NEXT:    vmovups %ymm1, 64(%rdi)
1457; AVX1-NEXT:    vmovups %ymm2, 32(%rdi)
1458; AVX1-NEXT:    vmovups %ymm0, (%rdi)
1459; AVX1-NEXT:    vzeroupper
1460; AVX1-NEXT:    retq
1461;
1462; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
1463; AVX2:       # %bb.0:
1464; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1465; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
1466; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1467; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
1468; AVX2-NEXT:    vpalignr {{.*#+}} ymm6 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
1469; AVX2-NEXT:    vpalignr {{.*#+}} ymm7 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
1470; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1471; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1472; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1473; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1474; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
1475; AVX2-NEXT:    vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
1476; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
1477; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
1478; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20]
1479; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
1480; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm6
1481; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1482; AVX2-NEXT:    vpshufb %ymm7, %ymm6, %ymm6
1483; AVX2-NEXT:    vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
1484; AVX2-NEXT:    vpshufb %ymm7, %ymm5, %ymm5
1485; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
1486; AVX2-NEXT:    vpshufb %ymm7, %ymm0, %ymm0
1487; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm2
1488; AVX2-NEXT:    vpshufb %ymm7, %ymm2, %ymm2
1489; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7]
1490; AVX2-NEXT:    vpshufb %ymm7, %ymm4, %ymm4
1491; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
1492; AVX2-NEXT:    vpshufb %ymm7, %ymm1, %ymm1
1493; AVX2-NEXT:    vmovdqu %ymm1, 160(%rdi)
1494; AVX2-NEXT:    vmovdqu %ymm4, 128(%rdi)
1495; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
1496; AVX2-NEXT:    vmovdqu %ymm5, 32(%rdi)
1497; AVX2-NEXT:    vmovdqu %ymm2, 96(%rdi)
1498; AVX2-NEXT:    vmovdqu %ymm6, (%rdi)
1499; AVX2-NEXT:    vzeroupper
1500; AVX2-NEXT:    retq
1501;
1502; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
1503; AVX512:       # %bb.0:
1504; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
1505; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
1506; AVX512-NEXT:    vpalignr {{.*#+}} zmm3 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
1507; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
1508; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
1509; AVX512-NEXT:    vpalignr {{.*#+}} zmm2 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
1510; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
1511; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
1512; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm3
1513; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
1514; AVX512-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
1515; AVX512-NEXT:    vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1516; AVX512-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
1517; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
1518; AVX512-NEXT:    vpshufb %ymm4, %ymm6, %ymm6
1519; AVX512-NEXT:    vextracti64x4 $1, %zmm2, %ymm2
1520; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1521; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm7
1522; AVX512-NEXT:    vpshufb %ymm4, %ymm7, %ymm7
1523; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1524; AVX512-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
1525; AVX512-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
1526; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1527; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
1528; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm1
1529; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
1530; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
1531; AVX512-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
1532; AVX512-NEXT:    vmovdqu64 %zmm3, 64(%rdi)
1533; AVX512-NEXT:    vmovdqu64 %zmm1, (%rdi)
1534; AVX512-NEXT:    vzeroupper
1535; AVX512-NEXT:    retq
1536%1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1537%2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1538%3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
1539store <192 x i8> %3, <192 x i8>* %p, align 1
1540ret void
1541}
1542
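; Factor-3 deinterleaving load of a <192 x i8> vector into three <64 x i8> vectors.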
1543define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
1544; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
1545; AVX1:       # %bb.0:
1546; AVX1-NEXT:    vmovdqu (%rdi), %xmm11
1547; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm10
1548; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm8
1549; AVX1-NEXT:    vmovdqu 48(%rdi), %xmm3
1550; AVX1-NEXT:    vmovdqu 64(%rdi), %xmm12
1551; AVX1-NEXT:    vmovdqu 80(%rdi), %xmm9
1552; AVX1-NEXT:    vmovdqu 96(%rdi), %xmm6
1553; AVX1-NEXT:    vmovdqu 112(%rdi), %xmm14
1554; AVX1-NEXT:    vmovdqu 128(%rdi), %xmm13
1555; AVX1-NEXT:    vmovdqu 144(%rdi), %xmm5
1556; AVX1-NEXT:    vmovdqu 160(%rdi), %xmm1
1557; AVX1-NEXT:    vmovdqu 176(%rdi), %xmm15
1558; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1559; AVX1-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
1560; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
1561; AVX1-NEXT:    vpshufb %xmm4, %xmm11, %xmm2
1562; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
1563; AVX1-NEXT:    vpshufb %xmm4, %xmm10, %xmm11
1564; AVX1-NEXT:    vpshufb %xmm4, %xmm12, %xmm12
1565; AVX1-NEXT:    vpshufb %xmm4, %xmm14, %xmm14
1566; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
1567; AVX1-NEXT:    vpshufb %xmm4, %xmm13, %xmm0
1568; AVX1-NEXT:    vpshufb %xmm4, %xmm15, %xmm7
1569; AVX1-NEXT:    vpshufb %xmm4, %xmm8, %xmm13
1570; AVX1-NEXT:    vpshufb %xmm4, %xmm9, %xmm4
1571; AVX1-NEXT:    vpalignr {{.*#+}} xmm15 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1572; AVX1-NEXT:    vpalignr {{.*#+}} xmm10 = xmm13[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1573; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
1574; AVX1-NEXT:    vpalignr {{.*#+}} xmm8 = xmm0[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
1575; AVX1-NEXT:    vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1576; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
1577; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
1578; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm7
1579; AVX1-NEXT:    vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10]
1580; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
1581; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm14[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1582; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm14
1583; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1584; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
1585; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm12
1586; AVX1-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
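; ymm13 is a per-16-byte-lane blend mask (11 x 0xff then 5 x 0x00); the
; vandnps/vandps/vorps sequence below keeps bytes 0..10 of one source and
; takes bytes 11..15 of the other.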
1587; AVX1-NEXT:    vandnps %ymm12, %ymm13, %ymm12
1588; AVX1-NEXT:    vandps %ymm13, %ymm14, %ymm14
1589; AVX1-NEXT:    vorps %ymm12, %ymm14, %ymm12
1590; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm14
1591; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1592; AVX1-NEXT:    vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
1593; AVX1-NEXT:    vandnps %ymm14, %ymm13, %ymm14
1594; AVX1-NEXT:    vandps %ymm13, %ymm7, %ymm7
1595; AVX1-NEXT:    vorps %ymm14, %ymm7, %ymm13
1596; AVX1-NEXT:    vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
1597; AVX1-NEXT:    vpshufb %xmm14, %xmm3, %xmm3
1598; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
1599; AVX1-NEXT:    vpshufb %xmm7, %xmm15, %xmm4
1600; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
1601; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm10[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
1602; AVX1-NEXT:    vpshufb %xmm14, %xmm2, %xmm2
1603; AVX1-NEXT:    vpshufb %xmm7, %xmm10, %xmm4
1604; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1605; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1606; AVX1-NEXT:    vpshufb %xmm14, %xmm5, %xmm4
1607; AVX1-NEXT:    vpshufb %xmm7, %xmm9, %xmm5
1608; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
1609; AVX1-NEXT:    vpshufb %xmm14, %xmm6, %xmm5
1610; AVX1-NEXT:    vpalignr {{.*#+}} xmm6 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1611; AVX1-NEXT:    vpshufb %xmm7, %xmm8, %xmm0
1612; AVX1-NEXT:    vpor %xmm5, %xmm0, %xmm5
1613; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm0
1614; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
1615; AVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm1
1616; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
1617; AVX1-NEXT:    vpaddb -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
1618; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
1619; AVX1-NEXT:    vpaddb %xmm11, %xmm12, %xmm3
1620; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
1621; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
1622; AVX1-NEXT:    vpaddb %xmm6, %xmm13, %xmm2
1623; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
1624; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1625; AVX1-NEXT:    retq
1626;
1627; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
1628; AVX2:       # %bb.0:
1629; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
1630; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm1
1631; AVX2-NEXT:    vmovdqu 32(%rdi), %xmm2
1632; AVX2-NEXT:    vmovdqu 96(%rdi), %xmm3
1633; AVX2-NEXT:    vmovdqu 112(%rdi), %xmm4
1634; AVX2-NEXT:    vmovdqu 128(%rdi), %xmm5
1635; AVX2-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1636; AVX2-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1637; AVX2-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1638; AVX2-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1639; AVX2-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm4
1640; AVX2-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm5
1641; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1642; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm3
1643; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0
1644; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm1
1645; AVX2-NEXT:    vpshufb %ymm6, %ymm4, %ymm4
1646; AVX2-NEXT:    vpshufb %ymm6, %ymm5, %ymm5
1647; AVX2-NEXT:    vpshufb %ymm6, %ymm2, %ymm2
1648; AVX2-NEXT:    vpalignr {{.*#+}} ymm6 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1649; AVX2-NEXT:    vpalignr {{.*#+}} ymm7 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1650; AVX2-NEXT:    vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
1651; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1652; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1653; AVX2-NEXT:    vpalignr {{.*#+}} ymm2 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
1654; AVX2-NEXT:    vpalignr {{.*#+}} ymm4 = ymm7[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1655; AVX2-NEXT:    vpalignr {{.*#+}} ymm5 = ymm6[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1656; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1657; AVX2-NEXT:    # ymm8 = mem[0,1,0,1]
1658; AVX2-NEXT:    vpblendvb %ymm8, %ymm0, %ymm1, %ymm1
1659; AVX2-NEXT:    vpaddb %ymm5, %ymm1, %ymm1
1660; AVX2-NEXT:    vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
1661; AVX2-NEXT:    vpaddb %ymm4, %ymm2, %ymm2
1662; AVX2-NEXT:    vpblendvb %ymm8, %ymm6, %ymm0, %ymm0
1663; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1664; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
1665; AVX2-NEXT:    vpblendvb %ymm8, %ymm7, %ymm3, %ymm1
1666; AVX2-NEXT:    vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1667; AVX2-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
1668; AVX2-NEXT:    retq
1669;
1670; AVX512-LABEL: interleaved_load_vf64_i8_stride3:
1671; AVX512:       # %bb.0:
1672; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
1673; AVX512-NEXT:    vmovdqu 16(%rdi), %xmm1
1674; AVX512-NEXT:    vmovdqu 32(%rdi), %xmm2
1675; AVX512-NEXT:    vmovdqu 96(%rdi), %xmm3
1676; AVX512-NEXT:    vmovdqu 112(%rdi), %xmm4
1677; AVX512-NEXT:    vmovdqu 128(%rdi), %xmm5
1678; AVX512-NEXT:    vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1679; AVX512-NEXT:    vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1680; AVX512-NEXT:    vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1681; AVX512-NEXT:    vinserti128 $1, 144(%rdi), %ymm3, %ymm3
1682; AVX512-NEXT:    vinserti128 $1, 160(%rdi), %ymm4, %ymm4
1683; AVX512-NEXT:    vinserti128 $1, 176(%rdi), %ymm5, %ymm5
1684; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
1685; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
1686; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
1687; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1688; AVX512-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
1689; AVX512-NEXT:    vpshufb %zmm3, %zmm1, %zmm1
1690; AVX512-NEXT:    vpshufb %zmm3, %zmm2, %zmm2
1691; AVX512-NEXT:    vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
1692; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1693; AVX512-NEXT:    movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
1694; AVX512-NEXT:    kmovq %rax, %k1
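; k1 = 0xF800 replicated per 16-byte lane, so the merge-masked vpalignr below
; overwrites only bytes 11..15 of each lane of zmm0.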
1695; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1696; AVX512-NEXT:    # ymm4 = mem[0,1,0,1]
1697; AVX512-NEXT:    vpblendvb %ymm4, %ymm3, %ymm0, %ymm5
1698; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
1699; AVX512-NEXT:    vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
1700; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
1701; AVX512-NEXT:    vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
1702; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
1703; AVX512-NEXT:    vpalignr {{.*#+}} ymm1 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1704; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
1705; AVX512-NEXT:    vpblendvb %ymm4, %ymm2, %ymm6, %ymm2
1706; AVX512-NEXT:    vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
1707; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
1708; AVX512-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
1709; AVX512-NEXT:    retq
1710%wide.vec = load <192 x i8>, <192 x i8>* %ptr, align 1
1711%v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
1712%v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
1713%v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
1714%add1 = add <64 x i8> %v1, %v2
1715%add2 = add <64 x i8> %v3, %add1
1716ret <64 x i8> %add2
1717}
1718
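; Stride-4 interleaved store at VF=64: the four <64 x i8> operands are
; interleaved element-wise into a <256 x i8> vector (a0,b0,c0,d0,a1,b1,c1,d1,...)
; and stored, exercising the interleaved-store lowering on AVX1, AVX2 and
; AVX512BW.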
1719define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d, <256 x i8>* %p) {
1720; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
1721; AVX1:       # %bb.0:
1722; AVX1-NEXT:    subq $24, %rsp
1723; AVX1-NEXT:    .cfi_def_cfa_offset 32
1724; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1725; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm11
1726; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm12
1727; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
1728; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1729; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm13
1730; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm14
1731; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1732; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1733; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1734; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
1735; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1736; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
1737; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1738; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
1739; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1740; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
1741; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
1742; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
1743; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1744; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
1745; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
1746; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
1747; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
1748; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
1749; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm3
1750; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
1751; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
1752; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1753; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
1754; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
1755; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
1756; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm1
1757; AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) # 32-byte Spill
1758; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
1759; AVX1-NEXT:    vmovdqa %xmm8, %xmm2
1760; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
1761; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm8, %ymm13
1762; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
1763; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
1764; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
1765; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
1766; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
1767; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
1768; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
1769; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
1770; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm14
1771; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
1772; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
1773; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1774; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
1775; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
1776; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
1777; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm9, %ymm9
1778; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
1779; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
1780; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1781; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1782; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm8, %ymm0
1783; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm13, %ymm8
1784; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm5 = ymm13[2,3],ymm0[2,3]
1785; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm6, %ymm1
1786; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
1787; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
1788; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm2
1789; AVX1-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload
1790; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm0
1791; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
1792; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm7, %ymm3
1793; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm14, %ymm7
1794; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3]
1795; AVX1-NEXT:    vmovaps %ymm3, 224(%rdi)
1796; AVX1-NEXT:    vmovaps %ymm2, 192(%rdi)
1797; AVX1-NEXT:    vmovaps %ymm7, 160(%rdi)
1798; AVX1-NEXT:    vmovaps %ymm0, 128(%rdi)
1799; AVX1-NEXT:    vmovaps %ymm1, 96(%rdi)
1800; AVX1-NEXT:    vmovaps %ymm5, 64(%rdi)
1801; AVX1-NEXT:    vmovaps %ymm6, 32(%rdi)
1802; AVX1-NEXT:    vmovaps %ymm8, (%rdi)
1803; AVX1-NEXT:    addq $24, %rsp
1804; AVX1-NEXT:    .cfi_def_cfa_offset 8
1805; AVX1-NEXT:    vzeroupper
1806; AVX1-NEXT:    retq
1807;
1808; AVX2-LABEL: interleaved_store_vf64_i8_stride4:
1809; AVX2:       # %bb.0:
1810; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1811; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
1812; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1813; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
1814; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
1815; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23]
1816; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
1817; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31]
1818; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11]
1819; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
1820; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15]
1821; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
1822; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
1823; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
1824; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
1825; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
1826; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm7, %ymm4
1827; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm9, %ymm5
1828; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
1829; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
1830; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm6, %ymm7
1831; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm8, %ymm9
1832; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
1833; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
1834; AVX2-NEXT:    vmovdqa %ymm1, 224(%rdi)
1835; AVX2-NEXT:    vmovdqa %ymm3, 192(%rdi)
1836; AVX2-NEXT:    vmovdqa %ymm0, 96(%rdi)
1837; AVX2-NEXT:    vmovdqa %ymm2, 64(%rdi)
1838; AVX2-NEXT:    vmovdqa %ymm9, 160(%rdi)
1839; AVX2-NEXT:    vmovdqa %ymm7, 128(%rdi)
1840; AVX2-NEXT:    vmovdqa %ymm5, 32(%rdi)
1841; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1842; AVX2-NEXT:    vzeroupper
1843; AVX2-NEXT:    retq
1844;
1845; AVX512-LABEL: interleaved_store_vf64_i8_stride4:
1846; AVX512:       # %bb.0:
1847; AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
1848; AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
1849; AVX512-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55]
1850; AVX512-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63]
1851; AVX512-NEXT:    vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27]
1852; AVX512-NEXT:    vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31]
1853; AVX512-NEXT:    vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27]
1854; AVX512-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
1855; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm2
1856; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm5
1857; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm1[2,3]
1858; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm0[2,3]
1859; AVX512-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
1860; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1861; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm8
1862; AVX512-NEXT:    vextracti64x4 $1, %zmm4, %ymm4
1863; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1864; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm9
1865; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
1866; AVX512-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
1867; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm2
1868; AVX512-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
1869; AVX512-NEXT:    vinserti64x4 $1, %ymm9, %zmm8, %zmm4
1870; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
1871; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rdi)
1872; AVX512-NEXT:    vmovdqa64 %zmm3, 64(%rdi)
1873; AVX512-NEXT:    vmovdqa64 %zmm4, 128(%rdi)
1874; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
1875; AVX512-NEXT:    vzeroupper
1876; AVX512-NEXT:    retq
1877%1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1878%2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
1879%interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
1880store <256 x i8> %interleaved, <256 x i8>* %p
1881ret void
1882}
1883