• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
4
5define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
6; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
7; AVX512:       # %bb.0:
8; AVX512-NEXT:    kmovb (%rdi), %k0
9; AVX512-NEXT:    kshiftrb $4, %k0, %k0
10; AVX512-NEXT:    vpmovm2q %k0, %xmm2
11; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
12; AVX512-NEXT:    vpmovq2m %xmm2, %k1
13; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
14; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
15; AVX512-NEXT:    retq
16;
17; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
18; AVX512NOTDQ:       # %bb.0:
19; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
20; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
21; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
22; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
23; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
24; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
25; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
26; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
27; AVX512NOTDQ-NEXT:    retq
28    %d0 = load <8 x i1>, <8 x i1>* %a0
29    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
30    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
31    store <2 x double> %d2, <2 x double>* %a3
32    ret void
33}
34define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
35; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
36; AVX512:       # %bb.0:
37; AVX512-NEXT:    kmovb (%rdi), %k0
38; AVX512-NEXT:    kshiftrb $6, %k0, %k0
39; AVX512-NEXT:    vpmovm2q %k0, %xmm2
40; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
41; AVX512-NEXT:    vpmovq2m %xmm2, %k1
42; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
43; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
44; AVX512-NEXT:    retq
45;
46; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
47; AVX512NOTDQ:       # %bb.0:
48; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
49; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
50; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
51; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
52; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
53; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
54; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
55; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
56; AVX512NOTDQ-NEXT:    retq
57    %d0 = load <8 x i1>, <8 x i1>* %a0
58    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
59    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
60    store <2 x double> %d2, <2 x double>* %a3
61    ret void
62}
63define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
64; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
65; AVX512:       # %bb.0:
66; AVX512-NEXT:    kmovw (%rdi), %k0
67; AVX512-NEXT:    kshiftrw $8, %k0, %k0
68; AVX512-NEXT:    vpmovm2q %k0, %xmm2
69; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
70; AVX512-NEXT:    vpmovq2m %xmm2, %k1
71; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
72; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
73; AVX512-NEXT:    retq
74;
75; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
76; AVX512NOTDQ:       # %bb.0:
77; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
78; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
79; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
80; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
81; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
82; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
83; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
84; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
85; AVX512NOTDQ-NEXT:    retq
86    %d0 = load <16 x i1>, <16 x i1>* %a0
87    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
88    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
89    store <2 x double> %d2, <2 x double>* %a3
90    ret void
91}
92define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
93; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
94; AVX512:       # %bb.0:
95; AVX512-NEXT:    kmovw (%rdi), %k0
96; AVX512-NEXT:    kshiftrw $8, %k0, %k0
97; AVX512-NEXT:    vpmovm2d %k0, %xmm2
98; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
99; AVX512-NEXT:    vpmovd2m %xmm2, %k1
100; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
101; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
102; AVX512-NEXT:    retq
103;
104; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
105; AVX512NOTDQ:       # %bb.0:
106; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
107; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
108; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
109; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
110; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
111; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
112; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
113; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
114; AVX512NOTDQ-NEXT:    retq
115    %d0 = load <16 x i1>, <16 x i1>* %a0
116    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
117    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
118    store <4 x float> %d2, <4 x float>* %a3
119    ret void
120}
121define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
122; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
123; AVX512:       # %bb.0:
124; AVX512-NEXT:    kmovw (%rdi), %k0
125; AVX512-NEXT:    kshiftrw $14, %k0, %k0
126; AVX512-NEXT:    vpmovm2q %k0, %xmm2
127; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
128; AVX512-NEXT:    vpmovq2m %xmm2, %k1
129; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
130; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
131; AVX512-NEXT:    retq
132;
133; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
134; AVX512NOTDQ:       # %bb.0:
135; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
136; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
137; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
138; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
139; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
140; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
141; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
142; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
143; AVX512NOTDQ-NEXT:    retq
144    %d0 = load <16 x i1>, <16 x i1>* %a0
145    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
146    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
147    store <2 x double> %d2, <2 x double>* %a3
148    ret void
149}
150define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
151; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
152; AVX512:       # %bb.0:
153; AVX512-NEXT:    kmovw (%rdi), %k0
154; AVX512-NEXT:    kshiftrw $12, %k0, %k0
155; AVX512-NEXT:    vpmovm2d %k0, %xmm2
156; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
157; AVX512-NEXT:    vpmovd2m %xmm2, %k1
158; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
159; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
160; AVX512-NEXT:    retq
161;
162; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
163; AVX512NOTDQ:       # %bb.0:
164; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
165; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
166; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
167; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
168; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
169; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
170; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
171; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
172; AVX512NOTDQ-NEXT:    retq
173    %d0 = load <16 x i1>, <16 x i1>* %a0
174    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
175    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
176    store <4 x float> %d2, <4 x float>* %a3
177    ret void
178}
179define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
180; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
181; AVX512:       # %bb.0:
182; AVX512-NEXT:    kmovd (%rdi), %k0
183; AVX512-NEXT:    kshiftrd $16, %k0, %k0
184; AVX512-NEXT:    vpmovm2q %k0, %xmm2
185; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
186; AVX512-NEXT:    vpmovq2m %xmm2, %k1
187; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
188; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
189; AVX512-NEXT:    retq
190;
191; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
192; AVX512NOTDQ:       # %bb.0:
193; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
194; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
195; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
196; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
197; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
198; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
199; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
200; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
201; AVX512NOTDQ-NEXT:    retq
202    %d0 = load <32 x i1>, <32 x i1>* %a0
203    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
204    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
205    store <2 x double> %d2, <2 x double>* %a3
206    ret void
207}
208define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
209; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
210; AVX512:       # %bb.0:
211; AVX512-NEXT:    kmovd (%rdi), %k0
212; AVX512-NEXT:    kshiftrd $16, %k0, %k0
213; AVX512-NEXT:    vpmovm2d %k0, %xmm2
214; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
215; AVX512-NEXT:    vpmovd2m %xmm2, %k1
216; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
217; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
218; AVX512-NEXT:    retq
219;
220; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
221; AVX512NOTDQ:       # %bb.0:
222; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
223; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
224; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
225; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
226; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
227; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
228; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
229; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
230; AVX512NOTDQ-NEXT:    retq
231    %d0 = load <32 x i1>, <32 x i1>* %a0
232    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
233    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
234    store <4 x float> %d2, <4 x float>* %a3
235    ret void
236}
237define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
238; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
239; AVX512:       # %bb.0:
240; AVX512-NEXT:    kmovb 2(%rdi), %k0
241; AVX512-NEXT:    vpmovm2d %k0, %ymm2
242; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
243; AVX512-NEXT:    vpmovd2m %ymm2, %k1
244; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
245; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
246; AVX512-NEXT:    vzeroupper
247; AVX512-NEXT:    retq
248;
249; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
250; AVX512NOTDQ:       # %bb.0:
251; AVX512NOTDQ-NEXT:    kmovw 2(%rdi), %k1
252; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
253; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
254; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
255; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
256; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
257; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
258; AVX512NOTDQ-NEXT:    vzeroupper
259; AVX512NOTDQ-NEXT:    retq
260    %d0 = load <32 x i1>, <32 x i1>* %a0
261    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
262    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
263    store <8 x float> %d2, <8 x float>* %a3
264    ret void
265}
266define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
267; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
268; AVX512:       # %bb.0:
269; AVX512-NEXT:    kmovd (%rdi), %k0
270; AVX512-NEXT:    kshiftrd $30, %k0, %k0
271; AVX512-NEXT:    vpmovm2q %k0, %xmm2
272; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
273; AVX512-NEXT:    vpmovq2m %xmm2, %k1
274; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
275; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
276; AVX512-NEXT:    retq
277;
278; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
279; AVX512NOTDQ:       # %bb.0:
280; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
281; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
282; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
283; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
284; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
285; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
286; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
287; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
288; AVX512NOTDQ-NEXT:    retq
289    %d0 = load <32 x i1>, <32 x i1>* %a0
290    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
291    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
292    store <2 x double> %d2, <2 x double>* %a3
293    ret void
294}
295define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
296; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
297; AVX512:       # %bb.0:
298; AVX512-NEXT:    kmovd (%rdi), %k0
299; AVX512-NEXT:    kshiftrd $28, %k0, %k0
300; AVX512-NEXT:    vpmovm2d %k0, %xmm2
301; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
302; AVX512-NEXT:    vpmovd2m %xmm2, %k1
303; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
304; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
305; AVX512-NEXT:    retq
306;
307; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
308; AVX512NOTDQ:       # %bb.0:
309; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
310; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
311; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
312; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
313; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
314; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
315; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
316; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
317; AVX512NOTDQ-NEXT:    retq
318    %d0 = load <32 x i1>, <32 x i1>* %a0
319    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
320    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
321    store <4 x float> %d2, <4 x float>* %a3
322    ret void
323}
324define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
325; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
326; AVX512:       # %bb.0:
327; AVX512-NEXT:    kmovb 3(%rdi), %k0
328; AVX512-NEXT:    vpmovm2d %k0, %ymm2
329; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
330; AVX512-NEXT:    vpermd %ymm2, %ymm3, %ymm2
331; AVX512-NEXT:    vpmovd2m %ymm2, %k1
332; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
333; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
334; AVX512-NEXT:    vzeroupper
335; AVX512-NEXT:    retq
336;
337; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
338; AVX512NOTDQ:       # %bb.0:
339; AVX512NOTDQ-NEXT:    movzbl 3(%rdi), %eax
340; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
341; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
342; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
343; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
344; AVX512NOTDQ-NEXT:    vpermd %ymm2, %ymm3, %ymm2
345; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
346; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
347; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
348; AVX512NOTDQ-NEXT:    vzeroupper
349; AVX512NOTDQ-NEXT:    retq
350    %d0 = load <32 x i1>, <32 x i1>* %a0
351    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
352    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
353    store <8 x float> %d2, <8 x float>* %a3
354    ret void
355}
356define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
357; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
358; AVX512:       # %bb.0:
359; AVX512-NEXT:    kmovq (%rdi), %k0
360; AVX512-NEXT:    kshiftrq $32, %k0, %k0
361; AVX512-NEXT:    vpmovm2q %k0, %xmm2
362; AVX512-NEXT:    vpbroadcastq %xmm2, %xmm2
363; AVX512-NEXT:    vpmovq2m %xmm2, %k1
364; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
365; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
366; AVX512-NEXT:    retq
367;
368; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
369; AVX512NOTDQ:       # %bb.0:
370; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
371; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
372; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
373; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
374; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %xmm2
375; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
376; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
377; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
378; AVX512NOTDQ-NEXT:    retq
379    %d0 = load <64 x i1>, <64 x i1>* %a0
380    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
381    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
382    store <2 x double> %d2, <2 x double>* %a3
383    ret void
384}
385define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
386; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
387; AVX512:       # %bb.0:
388; AVX512-NEXT:    kmovq (%rdi), %k0
389; AVX512-NEXT:    kshiftrq $32, %k0, %k0
390; AVX512-NEXT:    vpmovm2d %k0, %xmm2
391; AVX512-NEXT:    vpbroadcastd %xmm2, %xmm2
392; AVX512-NEXT:    vpmovd2m %xmm2, %k1
393; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
394; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
395; AVX512-NEXT:    retq
396;
397; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
398; AVX512NOTDQ:       # %bb.0:
399; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
400; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
401; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
402; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
403; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %xmm2
404; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
405; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
406; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
407; AVX512NOTDQ-NEXT:    retq
408    %d0 = load <64 x i1>, <64 x i1>* %a0
409    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
410    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
411    store <4 x float> %d2, <4 x float>* %a3
412    ret void
413}
414define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
415; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
416; AVX512:       # %bb.0:
417; AVX512-NEXT:    kmovb 4(%rdi), %k0
418; AVX512-NEXT:    vpmovm2d %k0, %ymm2
419; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2
420; AVX512-NEXT:    vpmovd2m %ymm2, %k1
421; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
422; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
423; AVX512-NEXT:    vzeroupper
424; AVX512-NEXT:    retq
425;
426; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
427; AVX512NOTDQ:       # %bb.0:
428; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
429; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
430; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
431; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2
432; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
433; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
434; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
435; AVX512NOTDQ-NEXT:    vzeroupper
436; AVX512NOTDQ-NEXT:    retq
437    %d0 = load <64 x i1>, <64 x i1>* %a0
438    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
439    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
440    store <8 x float> %d2, <8 x float>* %a3
441    ret void
442}
443define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
444; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
445; AVX512:       # %bb.0:
446; AVX512-NEXT:    kmovw 4(%rdi), %k0
447; AVX512-NEXT:    vpmovm2d %k0, %zmm2
448; AVX512-NEXT:    vpbroadcastd %xmm2, %zmm2
449; AVX512-NEXT:    vpmovd2m %zmm2, %k1
450; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
451; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
452; AVX512-NEXT:    vzeroupper
453; AVX512-NEXT:    retq
454;
455; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
456; AVX512NOTDQ:       # %bb.0:
457; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
458; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
459; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %zmm2
460; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
461; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
462; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
463; AVX512NOTDQ-NEXT:    vzeroupper
464; AVX512NOTDQ-NEXT:    retq
465    %d0 = load <64 x i1>, <64 x i1>* %a0
466    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
467    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
468    store <16 x float> %d2, <16 x float>* %a3
469    ret void
470}
471define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
472; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
473; AVX512:       # %bb.0:
474; AVX512-NEXT:    kmovq (%rdi), %k0
475; AVX512-NEXT:    kshiftrq $62, %k0, %k0
476; AVX512-NEXT:    vpmovm2q %k0, %xmm2
477; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
478; AVX512-NEXT:    vpmovq2m %xmm2, %k1
479; AVX512-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
480; AVX512-NEXT:    vmovapd %xmm1, (%rsi)
481; AVX512-NEXT:    retq
482;
483; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
484; AVX512NOTDQ:       # %bb.0:
485; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
486; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
487; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
488; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm2, %xmm2 {%k1} {z}
489; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
490; AVX512NOTDQ-NEXT:    vptestmq %xmm2, %xmm2, %k1
491; AVX512NOTDQ-NEXT:    vmovapd %xmm0, %xmm1 {%k1}
492; AVX512NOTDQ-NEXT:    vmovapd %xmm1, (%rsi)
493; AVX512NOTDQ-NEXT:    retq
494    %d0 = load <64 x i1>, <64 x i1>* %a0
495    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
496    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
497    store <2 x double> %d2, <2 x double>* %a3
498    ret void
499}
500define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
501; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
502; AVX512:       # %bb.0:
503; AVX512-NEXT:    kmovq (%rdi), %k0
504; AVX512-NEXT:    kshiftrq $60, %k0, %k0
505; AVX512-NEXT:    vpmovm2d %k0, %xmm2
506; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
507; AVX512-NEXT:    vpmovd2m %xmm2, %k1
508; AVX512-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
509; AVX512-NEXT:    vmovaps %xmm1, (%rsi)
510; AVX512-NEXT:    retq
511;
512; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
513; AVX512NOTDQ:       # %bb.0:
514; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
515; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
516; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
517; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm2, %xmm2 {%k1} {z}
518; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
519; AVX512NOTDQ-NEXT:    vptestmd %xmm2, %xmm2, %k1
520; AVX512NOTDQ-NEXT:    vmovaps %xmm0, %xmm1 {%k1}
521; AVX512NOTDQ-NEXT:    vmovaps %xmm1, (%rsi)
522; AVX512NOTDQ-NEXT:    retq
523    %d0 = load <64 x i1>, <64 x i1>* %a0
524    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
525    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
526    store <4 x float> %d2, <4 x float>* %a3
527    ret void
528}
529define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
530; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
531; AVX512:       # %bb.0:
532; AVX512-NEXT:    kmovb 7(%rdi), %k0
533; AVX512-NEXT:    vpmovm2d %k0, %ymm2
534; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
535; AVX512-NEXT:    vpermd %ymm2, %ymm3, %ymm2
536; AVX512-NEXT:    vpmovd2m %ymm2, %k1
537; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
538; AVX512-NEXT:    vmovaps %ymm1, (%rsi)
539; AVX512-NEXT:    vzeroupper
540; AVX512-NEXT:    retq
541;
542; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
543; AVX512NOTDQ:       # %bb.0:
544; AVX512NOTDQ-NEXT:    movzbl 7(%rdi), %eax
545; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
546; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
547; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z}
548; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
549; AVX512NOTDQ-NEXT:    vpermd %ymm2, %ymm3, %ymm2
550; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1
551; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
552; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)
553; AVX512NOTDQ-NEXT:    vzeroupper
554; AVX512NOTDQ-NEXT:    retq
555    %d0 = load <64 x i1>, <64 x i1>* %a0
556    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
557    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
558    store <8 x float> %d2, <8 x float>* %a3
559    ret void
560}
561define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
562; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
563; AVX512:       # %bb.0:
564; AVX512-NEXT:    kmovw 6(%rdi), %k0
565; AVX512-NEXT:    vpmovm2d %k0, %zmm2
566; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
567; AVX512-NEXT:    vpermd %zmm2, %zmm3, %zmm2
568; AVX512-NEXT:    vpmovd2m %zmm2, %k1
569; AVX512-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
570; AVX512-NEXT:    vmovaps %zmm1, (%rsi)
571; AVX512-NEXT:    vzeroupper
572; AVX512-NEXT:    retq
573;
574; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
575; AVX512NOTDQ:       # %bb.0:
576; AVX512NOTDQ-NEXT:    kmovw 6(%rdi), %k1
577; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
578; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
579; AVX512NOTDQ-NEXT:    vpermd %zmm2, %zmm3, %zmm2
580; AVX512NOTDQ-NEXT:    vptestmd %zmm2, %zmm2, %k1
581; AVX512NOTDQ-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
582; AVX512NOTDQ-NEXT:    vmovaps %zmm1, (%rsi)
583; AVX512NOTDQ-NEXT:    vzeroupper
584; AVX512NOTDQ-NEXT:    retq
585    %d0 = load <64 x i1>, <64 x i1>* %a0
586    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
587    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
588    store <16 x float> %d2, <16 x float>* %a3
589    ret void
590}
591define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
592; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
593; AVX512:       # %bb.0:
594; AVX512-NEXT:    kmovb (%rdi), %k0
595; AVX512-NEXT:    kshiftrb $1, %k0, %k0
596; AVX512-NEXT:    kshiftlb $7, %k0, %k0
597; AVX512-NEXT:    kshiftrb $7, %k0, %k0
598; AVX512-NEXT:    kmovb %k0, (%rsi)
599; AVX512-NEXT:    retq
600;
601; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
602; AVX512NOTDQ:       # %bb.0:
603; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
604; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
605; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
606; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
607; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
608; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
609; AVX512NOTDQ-NEXT:    retq
610    %d0 = load <2 x i1>, <2 x i1>* %a0
611    %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
612    store <1 x i1> %d1, <1 x i1>* %a1
613    ret void
614}
615define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
616; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
617; AVX512:       # %bb.0:
618; AVX512-NEXT:    movb (%rdi), %al
619; AVX512-NEXT:    shrb %al
620; AVX512-NEXT:    xorl %ecx, %ecx
621; AVX512-NEXT:    testb $1, %al
622; AVX512-NEXT:    movl $255, %eax
623; AVX512-NEXT:    cmovel %ecx, %eax
624; AVX512-NEXT:    kmovd %eax, %k0
625; AVX512-NEXT:    kshiftrb $1, %k0, %k0
626; AVX512-NEXT:    kshiftlb $7, %k0, %k0
627; AVX512-NEXT:    kshiftrb $7, %k0, %k0
628; AVX512-NEXT:    kmovb %k0, (%rsi)
629; AVX512-NEXT:    retq
630;
631; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
632; AVX512NOTDQ:       # %bb.0:
633; AVX512NOTDQ-NEXT:    movb (%rdi), %al
634; AVX512NOTDQ-NEXT:    shrb %al
635; AVX512NOTDQ-NEXT:    xorl %ecx, %ecx
636; AVX512NOTDQ-NEXT:    testb $1, %al
637; AVX512NOTDQ-NEXT:    movl $255, %eax
638; AVX512NOTDQ-NEXT:    cmovel %ecx, %eax
639; AVX512NOTDQ-NEXT:    kmovd %eax, %k0
640; AVX512NOTDQ-NEXT:    kshiftrw $1, %k0, %k0
641; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
642; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
643; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
644; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
645; AVX512NOTDQ-NEXT:    retq
646    %d0 = load <3 x i1>, <3 x i1>* %a0
647    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
648    store <1 x i1> %d1, <1 x i1>* %a1
649    ret void
650}
651define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
652; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
653; AVX512:       # %bb.0:
654; AVX512-NEXT:    xorl %eax, %eax
655; AVX512-NEXT:    testb $4, (%rdi)
656; AVX512-NEXT:    movl $255, %ecx
657; AVX512-NEXT:    cmovel %eax, %ecx
658; AVX512-NEXT:    kmovd %ecx, %k0
659; AVX512-NEXT:    kshiftrb $2, %k0, %k0
660; AVX512-NEXT:    kshiftlb $7, %k0, %k0
661; AVX512-NEXT:    kshiftrb $7, %k0, %k0
662; AVX512-NEXT:    kmovb %k0, (%rsi)
663; AVX512-NEXT:    retq
664;
665; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
666; AVX512NOTDQ:       # %bb.0:
667; AVX512NOTDQ-NEXT:    xorl %eax, %eax
668; AVX512NOTDQ-NEXT:    testb $4, (%rdi)
669; AVX512NOTDQ-NEXT:    movl $255, %ecx
670; AVX512NOTDQ-NEXT:    cmovel %eax, %ecx
671; AVX512NOTDQ-NEXT:    kmovd %ecx, %k0
672; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
673; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
674; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
675; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
676; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
677; AVX512NOTDQ-NEXT:    retq
678    %d0 = load <3 x i1>, <3 x i1>* %a0
679    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
680    store <1 x i1> %d1, <1 x i1>* %a1
681    ret void
682}
683define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
684; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
685; AVX512:       # %bb.0:
686; AVX512-NEXT:    kmovb (%rdi), %k0
687; AVX512-NEXT:    kshiftrb $2, %k0, %k0
688; AVX512-NEXT:    kshiftlb $7, %k0, %k0
689; AVX512-NEXT:    kshiftrb $7, %k0, %k0
690; AVX512-NEXT:    kmovb %k0, (%rsi)
691; AVX512-NEXT:    retq
692;
693; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
694; AVX512NOTDQ:       # %bb.0:
695; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
696; AVX512NOTDQ-NEXT:    kshiftrw $2, %k0, %k0
697; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
698; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
699; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
700; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
701; AVX512NOTDQ-NEXT:    retq
702    %d0 = load <4 x i1>, <4 x i1>* %a0
703    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
704    store <1 x i1> %d1, <1 x i1>* %a1
705    ret void
706}
707define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
708; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
709; AVX512:       # %bb.0:
710; AVX512-NEXT:    kmovb (%rdi), %k0
711; AVX512-NEXT:    kshiftrb $3, %k0, %k0
712; AVX512-NEXT:    kshiftlb $7, %k0, %k0
713; AVX512-NEXT:    kshiftrb $7, %k0, %k0
714; AVX512-NEXT:    kmovb %k0, (%rsi)
715; AVX512-NEXT:    retq
716;
717; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
718; AVX512NOTDQ:       # %bb.0:
719; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
720; AVX512NOTDQ-NEXT:    kshiftrw $3, %k0, %k0
721; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
722; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
723; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
724; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
725; AVX512NOTDQ-NEXT:    retq
726    %d0 = load <4 x i1>, <4 x i1>* %a0
727    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
728    store <1 x i1> %d1, <1 x i1>* %a1
729    ret void
730}
731define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
732; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
733; AVX512:       # %bb.0:
734; AVX512-NEXT:    kmovb (%rdi), %k0
735; AVX512-NEXT:    kshiftrb $4, %k0, %k0
736; AVX512-NEXT:    kshiftlb $7, %k0, %k0
737; AVX512-NEXT:    kshiftrb $7, %k0, %k0
738; AVX512-NEXT:    kmovb %k0, (%rsi)
739; AVX512-NEXT:    retq
740;
741; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
742; AVX512NOTDQ:       # %bb.0:
743; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
744; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k0
745; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
746; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
747; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
748; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
749; AVX512NOTDQ-NEXT:    retq
750    %d0 = load <8 x i1>, <8 x i1>* %a0
751    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
752    store <1 x i1> %d1, <1 x i1>* %a1
753    ret void
754}
755define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
756; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
757; AVX512:       # %bb.0:
758; AVX512-NEXT:    kmovb (%rdi), %k0
759; AVX512-NEXT:    kshiftrb $4, %k0, %k0
760; AVX512-NEXT:    vpmovm2q %k0, %xmm0
761; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
762; AVX512-NEXT:    vpmovq2m %xmm0, %k0
763; AVX512-NEXT:    kmovb %k0, (%rsi)
764; AVX512-NEXT:    retq
765;
766; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
767; AVX512NOTDQ:       # %bb.0:
768; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
769; AVX512NOTDQ-NEXT:    kshiftrw $4, %k0, %k1
770; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
771; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
772; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
773; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
774; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
775; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
776; AVX512NOTDQ-NEXT:    retq
777    %d0 = load <8 x i1>, <8 x i1>* %a0
778    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
779    store <2 x i1> %d1, <2 x i1>* %a1
780    ret void
781}
782define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
783; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
784; AVX512:       # %bb.0:
785; AVX512-NEXT:    kmovb (%rdi), %k0
786; AVX512-NEXT:    kshiftrb $7, %k0, %k0
787; AVX512-NEXT:    kshiftlb $7, %k0, %k0
788; AVX512-NEXT:    kshiftrb $7, %k0, %k0
789; AVX512-NEXT:    kmovb %k0, (%rsi)
790; AVX512-NEXT:    retq
791;
792; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
793; AVX512NOTDQ:       # %bb.0:
794; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
795; AVX512NOTDQ-NEXT:    kshiftrw $7, %k0, %k0
796; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
797; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
798; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
799; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
800; AVX512NOTDQ-NEXT:    retq
801    %d0 = load <8 x i1>, <8 x i1>* %a0
802    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
803    store <1 x i1> %d1, <1 x i1>* %a1
804    ret void
805}
806define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
807; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
808; AVX512:       # %bb.0:
809; AVX512-NEXT:    kmovb (%rdi), %k0
810; AVX512-NEXT:    kshiftrb $6, %k0, %k0
811; AVX512-NEXT:    vpmovm2q %k0, %xmm0
812; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
813; AVX512-NEXT:    vpmovq2m %xmm0, %k0
814; AVX512-NEXT:    kmovb %k0, (%rsi)
815; AVX512-NEXT:    retq
816;
817; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
818; AVX512NOTDQ:       # %bb.0:
819; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
820; AVX512NOTDQ-NEXT:    kshiftrw $6, %k0, %k1
821; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
822; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
823; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
824; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
825; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
826; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
827; AVX512NOTDQ-NEXT:    retq
828    %d0 = load <8 x i1>, <8 x i1>* %a0
829    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
830    store <2 x i1> %d1, <2 x i1>* %a1
831    ret void
832}
833define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
834; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
835; AVX512:       # %bb.0:
836; AVX512-NEXT:    kmovw (%rdi), %k0
837; AVX512-NEXT:    kshiftrw $8, %k0, %k0
838; AVX512-NEXT:    kshiftlb $7, %k0, %k0
839; AVX512-NEXT:    kshiftrb $7, %k0, %k0
840; AVX512-NEXT:    kmovb %k0, (%rsi)
841; AVX512-NEXT:    retq
842;
843; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
844; AVX512NOTDQ:       # %bb.0:
845; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
846; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k0
847; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
848; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
849; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
850; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
851; AVX512NOTDQ-NEXT:    retq
852    %d0 = load <16 x i1>, <16 x i1>* %a0
853    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
854    store <1 x i1> %d1, <1 x i1>* %a1
855    ret void
856}
857define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
858; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
859; AVX512:       # %bb.0:
860; AVX512-NEXT:    kmovw (%rdi), %k0
861; AVX512-NEXT:    kshiftrw $8, %k0, %k0
862; AVX512-NEXT:    vpmovm2q %k0, %xmm0
863; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
864; AVX512-NEXT:    vpmovq2m %xmm0, %k0
865; AVX512-NEXT:    kmovb %k0, (%rsi)
866; AVX512-NEXT:    retq
867;
868; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
869; AVX512NOTDQ:       # %bb.0:
870; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
871; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
872; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
873; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
874; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
875; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
876; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
877; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
878; AVX512NOTDQ-NEXT:    retq
879    %d0 = load <16 x i1>, <16 x i1>* %a0
880    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
881    store <2 x i1> %d1, <2 x i1>* %a1
882    ret void
883}
884define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
885; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
886; AVX512:       # %bb.0:
887; AVX512-NEXT:    kmovw (%rdi), %k0
888; AVX512-NEXT:    kshiftrw $8, %k0, %k0
889; AVX512-NEXT:    vpmovm2d %k0, %xmm0
890; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
891; AVX512-NEXT:    vpmovd2m %xmm0, %k0
892; AVX512-NEXT:    kmovb %k0, (%rsi)
893; AVX512-NEXT:    retq
894;
895; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
896; AVX512NOTDQ:       # %bb.0:
897; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
898; AVX512NOTDQ-NEXT:    kshiftrw $8, %k0, %k1
899; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
900; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
901; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
902; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
903; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
904; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
905; AVX512NOTDQ-NEXT:    retq
906    %d0 = load <16 x i1>, <16 x i1>* %a0
907    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
908    store <4 x i1> %d1, <4 x i1>* %a1
909    ret void
910}
911define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
912; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
913; AVX512:       # %bb.0:
914; AVX512-NEXT:    kmovw (%rdi), %k0
915; AVX512-NEXT:    kshiftrw $15, %k0, %k0
916; AVX512-NEXT:    kshiftlb $7, %k0, %k0
917; AVX512-NEXT:    kshiftrb $7, %k0, %k0
918; AVX512-NEXT:    kmovb %k0, (%rsi)
919; AVX512-NEXT:    retq
920;
921; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
922; AVX512NOTDQ:       # %bb.0:
923; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
924; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
925; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
926; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
927; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
928; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
929; AVX512NOTDQ-NEXT:    retq
930    %d0 = load <16 x i1>, <16 x i1>* %a0
931    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
932    store <1 x i1> %d1, <1 x i1>* %a1
933    ret void
934}
935define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
936; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
937; AVX512:       # %bb.0:
938; AVX512-NEXT:    kmovw (%rdi), %k0
939; AVX512-NEXT:    kshiftrw $14, %k0, %k0
940; AVX512-NEXT:    vpmovm2q %k0, %xmm0
941; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
942; AVX512-NEXT:    vpmovq2m %xmm0, %k0
943; AVX512-NEXT:    kmovb %k0, (%rsi)
944; AVX512-NEXT:    retq
945;
946; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
947; AVX512NOTDQ:       # %bb.0:
948; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
949; AVX512NOTDQ-NEXT:    kshiftrw $14, %k0, %k1
950; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
951; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
952; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
953; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
954; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
955; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
956; AVX512NOTDQ-NEXT:    retq
957    %d0 = load <16 x i1>, <16 x i1>* %a0
958    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
959    store <2 x i1> %d1, <2 x i1>* %a1
960    ret void
961}
962define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
963; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
964; AVX512:       # %bb.0:
965; AVX512-NEXT:    kmovw (%rdi), %k0
966; AVX512-NEXT:    kshiftrw $12, %k0, %k0
967; AVX512-NEXT:    vpmovm2d %k0, %xmm0
968; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
969; AVX512-NEXT:    vpmovd2m %xmm0, %k0
970; AVX512-NEXT:    kmovb %k0, (%rsi)
971; AVX512-NEXT:    retq
972;
973; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
974; AVX512NOTDQ:       # %bb.0:
975; AVX512NOTDQ-NEXT:    kmovw (%rdi), %k0
976; AVX512NOTDQ-NEXT:    kshiftrw $12, %k0, %k1
977; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
978; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
979; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
980; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
981; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
982; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
983; AVX512NOTDQ-NEXT:    retq
984    %d0 = load <16 x i1>, <16 x i1>* %a0
985    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
986    store <4 x i1> %d1, <4 x i1>* %a1
987    ret void
988}
989define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
990; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
991; AVX512:       # %bb.0:
992; AVX512-NEXT:    kmovd (%rdi), %k0
993; AVX512-NEXT:    kshiftrd $16, %k0, %k0
994; AVX512-NEXT:    kshiftlb $7, %k0, %k0
995; AVX512-NEXT:    kshiftrb $7, %k0, %k0
996; AVX512-NEXT:    kmovb %k0, (%rsi)
997; AVX512-NEXT:    retq
998;
999; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
1000; AVX512NOTDQ:       # %bb.0:
1001; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
1002; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k0
1003; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
1004; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
1005; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1006; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1007; AVX512NOTDQ-NEXT:    retq
1008    %d0 = load <32 x i1>, <32 x i1>* %a0
1009    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
1010    store <1 x i1> %d1, <1 x i1>* %a1
1011    ret void
1012}
1013define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
1014; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
1015; AVX512:       # %bb.0:
1016; AVX512-NEXT:    kmovd (%rdi), %k0
1017; AVX512-NEXT:    kshiftrd $16, %k0, %k0
1018; AVX512-NEXT:    vpmovm2q %k0, %xmm0
1019; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
1020; AVX512-NEXT:    vpmovq2m %xmm0, %k0
1021; AVX512-NEXT:    kmovb %k0, (%rsi)
1022; AVX512-NEXT:    retq
1023;
1024; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
1025; AVX512NOTDQ:       # %bb.0:
1026; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
1027; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
1028; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1029; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1030; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
1031; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
1032; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1033; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1034; AVX512NOTDQ-NEXT:    retq
1035    %d0 = load <32 x i1>, <32 x i1>* %a0
1036    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
1037    store <2 x i1> %d1, <2 x i1>* %a1
1038    ret void
1039}
1040define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
1041; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
1042; AVX512:       # %bb.0:
1043; AVX512-NEXT:    kmovd (%rdi), %k0
1044; AVX512-NEXT:    kshiftrd $16, %k0, %k0
1045; AVX512-NEXT:    vpmovm2d %k0, %xmm0
1046; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
1047; AVX512-NEXT:    vpmovd2m %xmm0, %k0
1048; AVX512-NEXT:    kmovb %k0, (%rsi)
1049; AVX512-NEXT:    retq
1050;
1051; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
1052; AVX512NOTDQ:       # %bb.0:
1053; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
1054; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1
1055; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1056; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1057; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
1058; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
1059; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1060; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1061; AVX512NOTDQ-NEXT:    retq
1062    %d0 = load <32 x i1>, <32 x i1>* %a0
1063    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
1064    store <4 x i1> %d1, <4 x i1>* %a1
1065    ret void
1066}
1067define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
1068; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
1069; AVX512:       # %bb.0:
1070; AVX512-NEXT:    kmovb 2(%rdi), %k0
1071; AVX512-NEXT:    vpmovm2d %k0, %ymm0
1072; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
1073; AVX512-NEXT:    vpmovd2m %ymm0, %k0
1074; AVX512-NEXT:    kmovb %k0, (%rsi)
1075; AVX512-NEXT:    vzeroupper
1076; AVX512-NEXT:    retq
1077;
1078; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
1079; AVX512NOTDQ:       # %bb.0:
1080; AVX512NOTDQ-NEXT:    kmovw 2(%rdi), %k1
1081; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
1082; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1083; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
1084; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
1085; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1086; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1087; AVX512NOTDQ-NEXT:    vzeroupper
1088; AVX512NOTDQ-NEXT:    retq
1089    %d0 = load <32 x i1>, <32 x i1>* %a0
1090    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
1091    store <8 x i1> %d1, <8 x i1>* %a1
1092    ret void
1093}
1094define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
1095; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
1096; AVX512:       # %bb.0:
1097; AVX512-NEXT:    kmovd (%rdi), %k0
1098; AVX512-NEXT:    kshiftrd $31, %k0, %k0
1099; AVX512-NEXT:    kshiftlb $7, %k0, %k0
1100; AVX512-NEXT:    kshiftrb $7, %k0, %k0
1101; AVX512-NEXT:    kmovb %k0, (%rsi)
1102; AVX512-NEXT:    retq
1103;
1104; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
1105; AVX512NOTDQ:       # %bb.0:
1106; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
1107; AVX512NOTDQ-NEXT:    kshiftrd $31, %k0, %k0
1108; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
1109; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
1110; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1111; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1112; AVX512NOTDQ-NEXT:    retq
1113    %d0 = load <32 x i1>, <32 x i1>* %a0
1114    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
1115    store <1 x i1> %d1, <1 x i1>* %a1
1116    ret void
1117}
1118define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
1119; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
1120; AVX512:       # %bb.0:
1121; AVX512-NEXT:    kmovd (%rdi), %k0
1122; AVX512-NEXT:    kshiftrd $30, %k0, %k0
1123; AVX512-NEXT:    vpmovm2q %k0, %xmm0
1124; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1125; AVX512-NEXT:    vpmovq2m %xmm0, %k0
1126; AVX512-NEXT:    kmovb %k0, (%rsi)
1127; AVX512-NEXT:    retq
1128;
1129; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
1130; AVX512NOTDQ:       # %bb.0:
1131; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
1132; AVX512NOTDQ-NEXT:    kshiftrd $30, %k0, %k1
1133; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1134; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1135; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1136; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
1137; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1138; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1139; AVX512NOTDQ-NEXT:    retq
1140    %d0 = load <32 x i1>, <32 x i1>* %a0
1141    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
1142    store <2 x i1> %d1, <2 x i1>* %a1
1143    ret void
1144}
1145define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
1146; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
1147; AVX512:       # %bb.0:
1148; AVX512-NEXT:    kmovd (%rdi), %k0
1149; AVX512-NEXT:    kshiftrd $28, %k0, %k0
1150; AVX512-NEXT:    vpmovm2d %k0, %xmm0
1151; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1152; AVX512-NEXT:    vpmovd2m %xmm0, %k0
1153; AVX512-NEXT:    kmovb %k0, (%rsi)
1154; AVX512-NEXT:    retq
1155;
1156; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
1157; AVX512NOTDQ:       # %bb.0:
1158; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0
1159; AVX512NOTDQ-NEXT:    kshiftrd $28, %k0, %k1
1160; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1161; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1162; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1163; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
1164; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1165; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1166; AVX512NOTDQ-NEXT:    retq
1167    %d0 = load <32 x i1>, <32 x i1>* %a0
1168    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
1169    store <4 x i1> %d1, <4 x i1>* %a1
1170    ret void
1171}
1172define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
1173; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
1174; AVX512:       # %bb.0:
1175; AVX512-NEXT:    kmovb 3(%rdi), %k0
1176; AVX512-NEXT:    vpmovm2d %k0, %ymm0
1177; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1178; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
1179; AVX512-NEXT:    vpmovd2m %ymm0, %k0
1180; AVX512-NEXT:    kmovb %k0, (%rsi)
1181; AVX512-NEXT:    vzeroupper
1182; AVX512-NEXT:    retq
1183;
1184; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
1185; AVX512NOTDQ:       # %bb.0:
1186; AVX512NOTDQ-NEXT:    movzbl 3(%rdi), %eax
1187; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
1188; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
1189; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1190; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
1191; AVX512NOTDQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
1192; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
1193; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1194; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1195; AVX512NOTDQ-NEXT:    vzeroupper
1196; AVX512NOTDQ-NEXT:    retq
1197    %d0 = load <32 x i1>, <32 x i1>* %a0
1198    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
1199    store <8 x i1> %d1, <8 x i1>* %a1
1200    ret void
1201}
1202define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
1203; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
1204; AVX512:       # %bb.0:
1205; AVX512-NEXT:    kmovq (%rdi), %k0
1206; AVX512-NEXT:    kshiftrq $32, %k0, %k0
1207; AVX512-NEXT:    kshiftlb $7, %k0, %k0
1208; AVX512-NEXT:    kshiftrb $7, %k0, %k0
1209; AVX512-NEXT:    kmovb %k0, (%rsi)
1210; AVX512-NEXT:    retq
1211;
1212; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
1213; AVX512NOTDQ:       # %bb.0:
1214; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
1215; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k0
1216; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
1217; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
1218; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1219; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1220; AVX512NOTDQ-NEXT:    retq
1221    %d0 = load <64 x i1>, <64 x i1>* %a0
1222    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
1223    store <1 x i1> %d1, <1 x i1>* %a1
1224    ret void
1225}
1226define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
1227; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
1228; AVX512:       # %bb.0:
1229; AVX512-NEXT:    kmovq (%rdi), %k0
1230; AVX512-NEXT:    kshiftrq $32, %k0, %k0
1231; AVX512-NEXT:    vpmovm2q %k0, %xmm0
1232; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
1233; AVX512-NEXT:    vpmovq2m %xmm0, %k0
1234; AVX512-NEXT:    kmovb %k0, (%rsi)
1235; AVX512-NEXT:    retq
1236;
1237; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
1238; AVX512NOTDQ:       # %bb.0:
1239; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
1240; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
1241; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
1242; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1243; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %xmm0
1244; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
1245; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
1246; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
1247; AVX512NOTDQ-NEXT:    retq
1248    %d0 = load <64 x i1>, <64 x i1>* %a0
1249    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
1250    store <2 x i1> %d1, <2 x i1>* %a1
1251    ret void
1252}
define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $32, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw 4(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-NEXT:    kmovw %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512NOTDQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <16 x i1> %d1, <16 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $63, %k0, %k0
; AVX512-NEXT:    kshiftlb $7, %k0, %k0
; AVX512-NEXT:    kshiftrb $7, %k0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $63, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $62, %k0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT:    vpmovq2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovq (%rdi), %k0
; AVX512-NEXT:    kshiftrq $60, %k0, %k0
; AVX512-NEXT:    vpmovm2d %k0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT:    vpmovd2m %xmm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT:    kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovb 7(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpmovd2m %ymm0, %k0
; AVX512-NEXT:    kmovb %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    movzbl 7(%rdi), %eax
; AVX512NOTDQ-NEXT:    kmovd %eax, %k1
; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT:    kmovd %k0, %eax
; AVX512NOTDQ-NEXT:    movb %al, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512:       # %bb.0:
; AVX512-NEXT:    kmovw 6(%rdi), %k0
; AVX512-NEXT:    vpmovm2d %k0, %zmm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    vpmovd2m %zmm0, %k0
; AVX512-NEXT:    kmovw %k0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ:       # %bb.0:
; AVX512NOTDQ-NEXT:    kmovw 6(%rdi), %k1
; AVX512NOTDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT:    kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT:    vzeroupper
; AVX512NOTDQ-NEXT:    retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <16 x i1> %d1, <16 x i1>* %a1
    ret void
}