; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s

; Test that we can unfold constant pool loads when we're using avx512's
; ability to fold a broadcast load into an operation.
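;
; Illustration only (our reading of the intent, not part of the autogenerated
; assertions): if the broadcast load of the constant stayed folded, the loop
; body would keep an embedded-broadcast memory operand, e.g. something like
;   vpaddd .LCPI0_0(%rip){1to16}, %zmm1, %zmm1
; (constant-pool label name hypothetical). The expected codegen below instead
; hoists a single register broadcast of the constant out of the loop and folds
; the full-width load of the array data into the arithmetic instruction, which
; is what the CHECK lines verify.
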
define void @bcast_unfold_add_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB0_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB0_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
  %tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB1_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB1_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
  %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
  %tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB2_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB2_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
  %tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
  %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB3_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB3_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
  %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
  %tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB4_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
  %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
  %tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
  %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB5_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
  %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
  %tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB6_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB6_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
  %tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB7_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB7_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
  %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
  %tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
  %tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
  %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
  %tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
  %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
  %tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
  %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
  %tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB12_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB12_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
  %tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB13_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB13_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
  %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
  %tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB14_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB14_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
  %tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB15_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vporq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB15_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
  %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
  %tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB16_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB16_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
  %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
  %tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB17_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB17_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
  %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
  %tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fneg_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB18_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB18_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fneg <16 x float> %tmp4
  %tmp6 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB19_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB19_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fneg <8 x float> %tmp4
  %tmp6 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB20_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB20_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fneg <4 x float> %tmp4
  %tmp6 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB21_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB21_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fneg <8 x double> %tmp4
  %tmp6 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB22_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB22_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fneg <4 x double> %tmp4
  %tmp6 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB23_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB23_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fneg <2 x double> %tmp4
  %tmp6 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fabs_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB24_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB24_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
  %tmp6 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0

define void @bcast_unfold_fabs_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB25_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB25_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
  %tmp6 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0

define void @bcast_unfold_fabs_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB26_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB26_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
  %tmp6 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0

define void @bcast_unfold_fabs_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB27_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB27_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
  %tmp6 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0

define void @bcast_unfold_fabs_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB28_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB28_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %tmp4)
  %tmp6 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0

define void @bcast_unfold_fabs_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB29_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB29_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = call <2 x double> @llvm.fabs.v2f64(<2 x double> %tmp4)
  %tmp6 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0

define void @bcast_unfold_fadd_v16f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB30_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB30_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fadd <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v8f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB31_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB31_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fadd <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v4f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB32_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB32_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fadd <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v8f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB33_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB33_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fadd <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v4f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB34_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB34_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fadd <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fadd_v2f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fadd_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB35_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vaddpd 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB35_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fadd <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v16f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB36_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB36_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fmul <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmul_v8f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fmul_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB37_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB37_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fmul <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

1308define void @bcast_unfold_fmul_v4f32(float* nocapture %arg) {
1309; CHECK-LABEL: bcast_unfold_fmul_v4f32:
1310; CHECK:       # %bb.0: # %bb
1311; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
1312; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1313; CHECK-NEXT:    .p2align 4, 0x90
1314; CHECK-NEXT:  .LBB38_1: # %bb1
1315; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
1316; CHECK-NEXT:    vmulps 4096(%rdi,%rax), %xmm0, %xmm1
1317; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
1318; CHECK-NEXT:    addq $16, %rax
1319; CHECK-NEXT:    jne .LBB38_1
1320; CHECK-NEXT:  # %bb.2: # %bb9
1321; CHECK-NEXT:    retq
1322bb:
1323  br label %bb1
1324
1325bb1:                                              ; preds = %bb1, %bb
1326  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1327  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
1328  %tmp3 = bitcast float* %tmp2 to <4 x float>*
1329  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
1330  %tmp5 = fmul <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
1331  %tmp6 = bitcast float* %tmp2 to <4 x float>*
1332  store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
1333  %tmp7 = add i64 %tmp, 4
1334  %tmp8 = icmp eq i64 %tmp7, 1024
1335  br i1 %tmp8, label %bb9, label %bb1
1336
1337bb9:                                              ; preds = %bb1
1338  ret void
1339}
1340
1341define void @bcast_unfold_fmul_v8f64(double* nocapture %arg) {
1342; CHECK-LABEL: bcast_unfold_fmul_v8f64:
1343; CHECK:       # %bb.0: # %bb
1344; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
1345; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1346; CHECK-NEXT:    .p2align 4, 0x90
1347; CHECK-NEXT:  .LBB39_1: # %bb1
1348; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
1349; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %zmm0, %zmm1
1350; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
1351; CHECK-NEXT:    addq $64, %rax
1352; CHECK-NEXT:    jne .LBB39_1
1353; CHECK-NEXT:  # %bb.2: # %bb9
1354; CHECK-NEXT:    vzeroupper
1355; CHECK-NEXT:    retq
1356bb:
1357  br label %bb1
1358
1359bb1:                                              ; preds = %bb1, %bb
1360  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1361  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1362  %tmp3 = bitcast double* %tmp2 to <8 x double>*
1363  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
1364  %tmp5 = fmul <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1365  %tmp6 = bitcast double* %tmp2 to <8 x double>*
1366  store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
1367  %tmp7 = add i64 %tmp, 8
1368  %tmp8 = icmp eq i64 %tmp7, 1024
1369  br i1 %tmp8, label %bb9, label %bb1
1370
1371bb9:                                              ; preds = %bb1
1372  ret void
1373}
1374
1375define void @bcast_unfold_fmul_v4f64(double* nocapture %arg) {
1376; CHECK-LABEL: bcast_unfold_fmul_v4f64:
1377; CHECK:       # %bb.0: # %bb
1378; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
1379; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
1380; CHECK-NEXT:    .p2align 4, 0x90
1381; CHECK-NEXT:  .LBB40_1: # %bb1
1382; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
1383; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %ymm0, %ymm1
1384; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
1385; CHECK-NEXT:    addq $32, %rax
1386; CHECK-NEXT:    jne .LBB40_1
1387; CHECK-NEXT:  # %bb.2: # %bb9
1388; CHECK-NEXT:    vzeroupper
1389; CHECK-NEXT:    retq
1390bb:
1391  br label %bb1
1392
1393bb1:                                              ; preds = %bb1, %bb
1394  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1395  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1396  %tmp3 = bitcast double* %tmp2 to <4 x double>*
1397  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
1398  %tmp5 = fmul <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
1399  %tmp6 = bitcast double* %tmp2 to <4 x double>*
1400  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
1401  %tmp7 = add i64 %tmp, 4
1402  %tmp8 = icmp eq i64 %tmp7, 1024
1403  br i1 %tmp8, label %bb9, label %bb1
1404
1405bb9:                                              ; preds = %bb1
1406  ret void
1407}
1408
1409define void @bcast_unfold_fmul_v2f64(double* nocapture %arg) {
1410; CHECK-LABEL: bcast_unfold_fmul_v2f64:
1411; CHECK:       # %bb.0: # %bb
1412; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
1413; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [3.0E+0,3.0E+0]
1414; CHECK-NEXT:    .p2align 4, 0x90
1415; CHECK-NEXT:  .LBB41_1: # %bb1
1416; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
1417; CHECK-NEXT:    vmulpd 8192(%rdi,%rax), %xmm0, %xmm1
1418; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
1419; CHECK-NEXT:    addq $16, %rax
1420; CHECK-NEXT:    jne .LBB41_1
1421; CHECK-NEXT:  # %bb.2: # %bb9
1422; CHECK-NEXT:    retq
1423bb:
1424  br label %bb1
1425
1426bb1:                                              ; preds = %bb1, %bb
1427  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
1428  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
1429  %tmp3 = bitcast double* %tmp2 to <2 x double>*
1430  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
1431  %tmp5 = fmul <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00>
1432  %tmp6 = bitcast double* %tmp2 to <2 x double>*
1433  store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
1434  %tmp7 = add i64 %tmp, 2
1435  %tmp8 = icmp eq i64 %tmp7, 1024
1436  br i1 %tmp8, label %bb9, label %bb1
1437
1438bb9:                                              ; preds = %bb1
1439  ret void
1440}
1441
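; In the fdiv loops below, the per-iteration data is the dividend, so it cannot sit in
; the memory-operand slot of vdivps/vdivpd; it is loaded with vmovups/vmovupd first,
; while the splat divisor stays hoisted in a register.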
define void @bcast_unfold_fdiv_v16f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB42_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB42_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fdiv <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v8f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB43_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vdivps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB43_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fdiv <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v4f32(float* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB44_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vdivps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB44_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fdiv <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v8f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB45_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB45_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fdiv <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v4f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB46_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vdivpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB46_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fdiv <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v2f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB47_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB47_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fdiv <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

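; The contracted fmul+fadd pairs below select FMA forms: the splat constant is the
; addend in the vfmadd213 cases and a multiplicand in the vfmadd231 cases, and in both
; it is kept in a register outside the loop.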
define void @bcast_unfold_fma213_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB48_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB48_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp4 = bitcast float* %tmp3 to <4 x float>*
  %tmp5 = load <4 x float>, <4 x float>* %tmp4, align 4
  %tmp6 = fmul contract <4 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp8 = bitcast float* %tmp3 to <4 x float>*
  store <4 x float> %tmp7, <4 x float>* %tmp8, align 4
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB49_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB49_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fmul contract <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <4 x float> %tmp4, %tmp5
  %tmp7 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB50_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB50_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp4 = bitcast float* %tmp3 to <8 x float>*
  %tmp5 = load <8 x float>, <8 x float>* %tmp4, align 4
  %tmp6 = fmul contract <8 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp8 = bitcast float* %tmp3 to <8 x float>*
  store <8 x float> %tmp7, <8 x float>* %tmp8, align 4
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB51_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB51_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fmul contract <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <8 x float> %tmp4, %tmp5
  %tmp7 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB52_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB52_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp4 = bitcast float* %tmp3 to <16 x float>*
  %tmp5 = load <16 x float>, <16 x float>* %tmp4, align 4
  %tmp6 = fmul contract <16 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <16 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp8 = bitcast float* %tmp3 to <16 x float>*
  store <16 x float> %tmp7, <16 x float>* %tmp8, align 4
  %tmp9 = add i64 %tmp, 16
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB53_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB53_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fmul contract <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <16 x float> %tmp4, %tmp5
  %tmp7 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB54_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB54_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp4 = bitcast double* %tmp3 to <2 x double>*
  %tmp5 = load <2 x double>, <2 x double>* %tmp4, align 4
  %tmp6 = fmul contract <2 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <2 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00>
  %tmp8 = bitcast double* %tmp3 to <2 x double>*
  store <2 x double> %tmp7, <2 x double>* %tmp8, align 8
  %tmp9 = add i64 %tmp, 2
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB55_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB55_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fmul contract <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <2 x double> %tmp4, %tmp5
  %tmp7 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB56_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB56_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp4 = bitcast double* %tmp3 to <4 x double>*
  %tmp5 = load <4 x double>, <4 x double>* %tmp4, align 8
  %tmp6 = fmul contract <4 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp8 = bitcast double* %tmp3 to <4 x double>*
  store <4 x double> %tmp7, <4 x double>* %tmp8, align 8
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB57_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB57_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fmul contract <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <4 x double> %tmp4, %tmp5
  %tmp7 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB58_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB58_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp4 = bitcast double* %tmp3 to <8 x double>*
  %tmp5 = load <8 x double>, <8 x double>* %tmp4, align 8
  %tmp6 = fmul contract <8 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp8 = bitcast double* %tmp3 to <8 x double>*
  store <8 x double> %tmp7, <8 x double>* %tmp8, align 8
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB59_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB59_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fmul contract <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <8 x double> %tmp4, %tmp5
  %tmp7 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

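; The fmax patterns below (fcmp ogt + select against a splat) lower to vmaxps/vmaxpd
; with the broadcast operand kept in a register.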
define void @bcast_unfold_fmax_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB60_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB60_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fcmp ogt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB61_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB61_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fcmp ogt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB62_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB62_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fcmp ogt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB63_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB63_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fcmp ogt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB64_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB64_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fcmp ogt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB65_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB65_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fcmp ogt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

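; The fmin patterns below are the mirror image: fcmp olt + select against a splat
; lowers to vminps/vminpd with the broadcast operand kept in a register.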
define void @bcast_unfold_fmin_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB66_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB66_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB67_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB67_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB68_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB68_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB69_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB69_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB70_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB70_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB71_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB71_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

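; The signed-minimum patterns below (icmp slt + select) lower to vpminsd/vpminsq with
; the loop load folded as the memory operand; the v2i64 case materializes the [2,2]
; constant with vmovdqa rather than a broadcast.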
2506define void @bcast_unfold_smin_v4i32(i32* %arg) {
2507; CHECK-LABEL: bcast_unfold_smin_v4i32:
2508; CHECK:       # %bb.0: # %bb
2509; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2510; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
2511; CHECK-NEXT:    .p2align 4, 0x90
2512; CHECK-NEXT:  .LBB72_1: # %bb1
2513; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2514; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
2515; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
2516; CHECK-NEXT:    addq $16, %rax
2517; CHECK-NEXT:    jne .LBB72_1
2518; CHECK-NEXT:  # %bb.2: # %bb10
2519; CHECK-NEXT:    retq
2520bb:
2521  br label %bb1
2522
2523bb1:                                              ; preds = %bb1, %bb
2524  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2525  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2526  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
2527  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
2528  %tmp5 = icmp slt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
2529  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
2530  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
2531  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
2532  %tmp8 = add i64 %tmp, 4
2533  %tmp9 = icmp eq i64 %tmp8, 1024
2534  br i1 %tmp9, label %bb10, label %bb1
2535
2536bb10:                                             ; preds = %bb1
2537  ret void
2538}
2539
2540define void @bcast_unfold_smin_v8i32(i32* %arg) {
2541; CHECK-LABEL: bcast_unfold_smin_v8i32:
2542; CHECK:       # %bb.0: # %bb
2543; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2544; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
2545; CHECK-NEXT:    .p2align 4, 0x90
2546; CHECK-NEXT:  .LBB73_1: # %bb1
2547; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2548; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %ymm0, %ymm1
2549; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
2550; CHECK-NEXT:    addq $32, %rax
2551; CHECK-NEXT:    jne .LBB73_1
2552; CHECK-NEXT:  # %bb.2: # %bb10
2553; CHECK-NEXT:    vzeroupper
2554; CHECK-NEXT:    retq
2555bb:
2556  br label %bb1
2557
2558bb1:                                              ; preds = %bb1, %bb
2559  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2560  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2561  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
2562  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
2563  %tmp5 = icmp slt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2564  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2565  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
2566  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
2567  %tmp8 = add i64 %tmp, 8
2568  %tmp9 = icmp eq i64 %tmp8, 1024
2569  br i1 %tmp9, label %bb10, label %bb1
2570
2571bb10:                                             ; preds = %bb1
2572  ret void
2573}
2574
2575define void @bcast_unfold_smin_v16i32(i32* %arg) {
2576; CHECK-LABEL: bcast_unfold_smin_v16i32:
2577; CHECK:       # %bb.0: # %bb
2578; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2579; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2580; CHECK-NEXT:    .p2align 4, 0x90
2581; CHECK-NEXT:  .LBB74_1: # %bb1
2582; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2583; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %zmm0, %zmm1
2584; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
2585; CHECK-NEXT:    addq $64, %rax
2586; CHECK-NEXT:    jne .LBB74_1
2587; CHECK-NEXT:  # %bb.2: # %bb10
2588; CHECK-NEXT:    vzeroupper
2589; CHECK-NEXT:    retq
2590bb:
2591  br label %bb1
2592
2593bb1:                                              ; preds = %bb1, %bb
2594  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2595  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2596  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
2597  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
2598  %tmp5 = icmp slt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2599  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2600  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
2601  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
2602  %tmp8 = add i64 %tmp, 16
2603  %tmp9 = icmp eq i64 %tmp8, 1024
2604  br i1 %tmp9, label %bb10, label %bb1
2605
2606bb10:                                             ; preds = %bb1
2607  ret void
2608}
2609
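; Note: in the <2 x i64> case the splat constant is expected to come from a full
; 128-bit constant-pool load (vmovdqa) rather than a vpbroadcastq.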
2610define void @bcast_unfold_smin_v2i64(i64* %arg) {
2611; CHECK-LABEL: bcast_unfold_smin_v2i64:
2612; CHECK:       # %bb.0: # %bb
2613; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
2614; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,2]
2615; CHECK-NEXT:    .p2align 4, 0x90
2616; CHECK-NEXT:  .LBB75_1: # %bb1
2617; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2618; CHECK-NEXT:    vpminsq 8192(%rdi,%rax), %xmm0, %xmm1
2619; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
2620; CHECK-NEXT:    addq $16, %rax
2621; CHECK-NEXT:    jne .LBB75_1
2622; CHECK-NEXT:  # %bb.2: # %bb10
2623; CHECK-NEXT:    retq
2624bb:
2625  br label %bb1
2626
2627bb1:                                              ; preds = %bb1, %bb
2628  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2629  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2630  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
2631  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
2632  %tmp5 = icmp slt <2 x i64> %tmp4, <i64 2, i64 2>
2633  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
2634  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
2635  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
2636  %tmp8 = add i64 %tmp, 2
2637  %tmp9 = icmp eq i64 %tmp8, 1024
2638  br i1 %tmp9, label %bb10, label %bb1
2639
2640bb10:                                             ; preds = %bb1
2641  ret void
2642}
2643
2644define void @bcast_unfold_smin_v4i64(i64* %arg) {
2645; CHECK-LABEL: bcast_unfold_smin_v4i64:
2646; CHECK:       # %bb.0: # %bb
2647; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
2648; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
2649; CHECK-NEXT:    .p2align 4, 0x90
2650; CHECK-NEXT:  .LBB76_1: # %bb1
2651; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2652; CHECK-NEXT:    vpminsq 8192(%rdi,%rax), %ymm0, %ymm1
2653; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
2654; CHECK-NEXT:    addq $32, %rax
2655; CHECK-NEXT:    jne .LBB76_1
2656; CHECK-NEXT:  # %bb.2: # %bb10
2657; CHECK-NEXT:    vzeroupper
2658; CHECK-NEXT:    retq
2659bb:
2660  br label %bb1
2661
2662bb1:                                              ; preds = %bb1, %bb
2663  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2664  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2665  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
2666  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
2667  %tmp5 = icmp slt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
2668  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
2669  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
2670  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
2671  %tmp8 = add i64 %tmp, 4
2672  %tmp9 = icmp eq i64 %tmp8, 1024
2673  br i1 %tmp9, label %bb10, label %bb1
2674
2675bb10:                                             ; preds = %bb1
2676  ret void
2677}
2678
2679define void @bcast_unfold_smin_v8i64(i64* %arg) {
2680; CHECK-LABEL: bcast_unfold_smin_v8i64:
2681; CHECK:       # %bb.0: # %bb
2682; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
2683; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
2684; CHECK-NEXT:    .p2align 4, 0x90
2685; CHECK-NEXT:  .LBB77_1: # %bb1
2686; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2687; CHECK-NEXT:    vpminsq 8192(%rdi,%rax), %zmm0, %zmm1
2688; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
2689; CHECK-NEXT:    addq $64, %rax
2690; CHECK-NEXT:    jne .LBB77_1
2691; CHECK-NEXT:  # %bb.2: # %bb10
2692; CHECK-NEXT:    vzeroupper
2693; CHECK-NEXT:    retq
2694bb:
2695  br label %bb1
2696
2697bb1:                                              ; preds = %bb1, %bb
2698  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2699  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2700  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
2701  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
2702  %tmp5 = icmp slt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2703  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2704  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
2705  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
2706  %tmp8 = add i64 %tmp, 8
2707  %tmp9 = icmp eq i64 %tmp8, 1024
2708  br i1 %tmp9, label %bb10, label %bb1
2709
2710bb10:                                             ; preds = %bb1
2711  ret void
2712}
2713
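; Signed-max tests: same structure as the signed-min tests above, using
; icmp sgt + select and expecting vpmaxsd/vpmaxsq.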
2714define void @bcast_unfold_smax_v4i32(i32* %arg) {
2715; CHECK-LABEL: bcast_unfold_smax_v4i32:
2716; CHECK:       # %bb.0: # %bb
2717; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2718; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
2719; CHECK-NEXT:    .p2align 4, 0x90
2720; CHECK-NEXT:  .LBB78_1: # %bb1
2721; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2722; CHECK-NEXT:    vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1
2723; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
2724; CHECK-NEXT:    addq $16, %rax
2725; CHECK-NEXT:    jne .LBB78_1
2726; CHECK-NEXT:  # %bb.2: # %bb10
2727; CHECK-NEXT:    retq
2728bb:
2729  br label %bb1
2730
2731bb1:                                              ; preds = %bb1, %bb
2732  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2733  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2734  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
2735  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
2736  %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
2737  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
2738  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
2739  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
2740  %tmp8 = add i64 %tmp, 4
2741  %tmp9 = icmp eq i64 %tmp8, 1024
2742  br i1 %tmp9, label %bb10, label %bb1
2743
2744bb10:                                             ; preds = %bb1
2745  ret void
2746}
2747
2748define void @bcast_unfold_smax_v8i32(i32* %arg) {
2749; CHECK-LABEL: bcast_unfold_smax_v8i32:
2750; CHECK:       # %bb.0: # %bb
2751; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2752; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
2753; CHECK-NEXT:    .p2align 4, 0x90
2754; CHECK-NEXT:  .LBB79_1: # %bb1
2755; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2756; CHECK-NEXT:    vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1
2757; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
2758; CHECK-NEXT:    addq $32, %rax
2759; CHECK-NEXT:    jne .LBB79_1
2760; CHECK-NEXT:  # %bb.2: # %bb10
2761; CHECK-NEXT:    vzeroupper
2762; CHECK-NEXT:    retq
2763bb:
2764  br label %bb1
2765
2766bb1:                                              ; preds = %bb1, %bb
2767  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2768  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2769  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
2770  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
2771  %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2772  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2773  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
2774  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
2775  %tmp8 = add i64 %tmp, 8
2776  %tmp9 = icmp eq i64 %tmp8, 1024
2777  br i1 %tmp9, label %bb10, label %bb1
2778
2779bb10:                                             ; preds = %bb1
2780  ret void
2781}
2782
2783define void @bcast_unfold_smax_v16i32(i32* %arg) {
2784; CHECK-LABEL: bcast_unfold_smax_v16i32:
2785; CHECK:       # %bb.0: # %bb
2786; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2787; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2788; CHECK-NEXT:    .p2align 4, 0x90
2789; CHECK-NEXT:  .LBB80_1: # %bb1
2790; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2791; CHECK-NEXT:    vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1
2792; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
2793; CHECK-NEXT:    addq $64, %rax
2794; CHECK-NEXT:    jne .LBB80_1
2795; CHECK-NEXT:  # %bb.2: # %bb10
2796; CHECK-NEXT:    vzeroupper
2797; CHECK-NEXT:    retq
2798bb:
2799  br label %bb1
2800
2801bb1:                                              ; preds = %bb1, %bb
2802  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2803  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2804  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
2805  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
2806  %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2807  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2808  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
2809  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
2810  %tmp8 = add i64 %tmp, 16
2811  %tmp9 = icmp eq i64 %tmp8, 1024
2812  br i1 %tmp9, label %bb10, label %bb1
2813
2814bb10:                                             ; preds = %bb1
2815  ret void
2816}
2817
2818define void @bcast_unfold_smax_v2i64(i64* %arg) {
2819; CHECK-LABEL: bcast_unfold_smax_v2i64:
2820; CHECK:       # %bb.0: # %bb
2821; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
2822; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,2]
2823; CHECK-NEXT:    .p2align 4, 0x90
2824; CHECK-NEXT:  .LBB81_1: # %bb1
2825; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2826; CHECK-NEXT:    vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1
2827; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
2828; CHECK-NEXT:    addq $16, %rax
2829; CHECK-NEXT:    jne .LBB81_1
2830; CHECK-NEXT:  # %bb.2: # %bb10
2831; CHECK-NEXT:    retq
2832bb:
2833  br label %bb1
2834
2835bb1:                                              ; preds = %bb1, %bb
2836  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2837  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2838  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
2839  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
2840  %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 2, i64 2>
2841  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
2842  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
2843  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
2844  %tmp8 = add i64 %tmp, 2
2845  %tmp9 = icmp eq i64 %tmp8, 1024
2846  br i1 %tmp9, label %bb10, label %bb1
2847
2848bb10:                                             ; preds = %bb1
2849  ret void
2850}
2851
2852define void @bcast_unfold_smax_v4i64(i64* %arg) {
2853; CHECK-LABEL: bcast_unfold_smax_v4i64:
2854; CHECK:       # %bb.0: # %bb
2855; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
2856; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
2857; CHECK-NEXT:    .p2align 4, 0x90
2858; CHECK-NEXT:  .LBB82_1: # %bb1
2859; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2860; CHECK-NEXT:    vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1
2861; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
2862; CHECK-NEXT:    addq $32, %rax
2863; CHECK-NEXT:    jne .LBB82_1
2864; CHECK-NEXT:  # %bb.2: # %bb10
2865; CHECK-NEXT:    vzeroupper
2866; CHECK-NEXT:    retq
2867bb:
2868  br label %bb1
2869
2870bb1:                                              ; preds = %bb1, %bb
2871  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2872  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2873  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
2874  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
2875  %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
2876  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
2877  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
2878  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
2879  %tmp8 = add i64 %tmp, 4
2880  %tmp9 = icmp eq i64 %tmp8, 1024
2881  br i1 %tmp9, label %bb10, label %bb1
2882
2883bb10:                                             ; preds = %bb1
2884  ret void
2885}
2886
2887define void @bcast_unfold_smax_v8i64(i64* %arg) {
2888; CHECK-LABEL: bcast_unfold_smax_v8i64:
2889; CHECK:       # %bb.0: # %bb
2890; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
2891; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
2892; CHECK-NEXT:    .p2align 4, 0x90
2893; CHECK-NEXT:  .LBB83_1: # %bb1
2894; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2895; CHECK-NEXT:    vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1
2896; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
2897; CHECK-NEXT:    addq $64, %rax
2898; CHECK-NEXT:    jne .LBB83_1
2899; CHECK-NEXT:  # %bb.2: # %bb10
2900; CHECK-NEXT:    vzeroupper
2901; CHECK-NEXT:    retq
2902bb:
2903  br label %bb1
2904
2905bb1:                                              ; preds = %bb1, %bb
2906  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2907  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
2908  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
2909  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
2910  %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2911  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
2912  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
2913  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
2914  %tmp8 = add i64 %tmp, 8
2915  %tmp9 = icmp eq i64 %tmp8, 1024
2916  br i1 %tmp9, label %bb10, label %bb1
2917
2918bb10:                                             ; preds = %bb1
2919  ret void
2920}
2921
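; Unsigned-min tests: icmp ult + select against a splat of 2, expecting
; vpminud/vpminuq with the constant kept in a register across the loop.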
2922define void @bcast_unfold_umin_v4i32(i32* %arg) {
2923; CHECK-LABEL: bcast_unfold_umin_v4i32:
2924; CHECK:       # %bb.0: # %bb
2925; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2926; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
2927; CHECK-NEXT:    .p2align 4, 0x90
2928; CHECK-NEXT:  .LBB84_1: # %bb1
2929; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2930; CHECK-NEXT:    vpminud 4096(%rdi,%rax), %xmm0, %xmm1
2931; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
2932; CHECK-NEXT:    addq $16, %rax
2933; CHECK-NEXT:    jne .LBB84_1
2934; CHECK-NEXT:  # %bb.2: # %bb10
2935; CHECK-NEXT:    retq
2936bb:
2937  br label %bb1
2938
2939bb1:                                              ; preds = %bb1, %bb
2940  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2941  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2942  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
2943  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
2944  %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
2945  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
2946  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
2947  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
2948  %tmp8 = add i64 %tmp, 4
2949  %tmp9 = icmp eq i64 %tmp8, 1024
2950  br i1 %tmp9, label %bb10, label %bb1
2951
2952bb10:                                             ; preds = %bb1
2953  ret void
2954}
2955
2956define void @bcast_unfold_umin_v8i32(i32* %arg) {
2957; CHECK-LABEL: bcast_unfold_umin_v8i32:
2958; CHECK:       # %bb.0: # %bb
2959; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2960; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
2961; CHECK-NEXT:    .p2align 4, 0x90
2962; CHECK-NEXT:  .LBB85_1: # %bb1
2963; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2964; CHECK-NEXT:    vpminud 4096(%rdi,%rax), %ymm0, %ymm1
2965; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
2966; CHECK-NEXT:    addq $32, %rax
2967; CHECK-NEXT:    jne .LBB85_1
2968; CHECK-NEXT:  # %bb.2: # %bb10
2969; CHECK-NEXT:    vzeroupper
2970; CHECK-NEXT:    retq
2971bb:
2972  br label %bb1
2973
2974bb1:                                              ; preds = %bb1, %bb
2975  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
2976  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
2977  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
2978  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
2979  %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2980  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2981  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
2982  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
2983  %tmp8 = add i64 %tmp, 8
2984  %tmp9 = icmp eq i64 %tmp8, 1024
2985  br i1 %tmp9, label %bb10, label %bb1
2986
2987bb10:                                             ; preds = %bb1
2988  ret void
2989}
2990
2991define void @bcast_unfold_umin_v16i32(i32* %arg) {
2992; CHECK-LABEL: bcast_unfold_umin_v16i32:
2993; CHECK:       # %bb.0: # %bb
2994; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
2995; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2996; CHECK-NEXT:    .p2align 4, 0x90
2997; CHECK-NEXT:  .LBB86_1: # %bb1
2998; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
2999; CHECK-NEXT:    vpminud 4096(%rdi,%rax), %zmm0, %zmm1
3000; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
3001; CHECK-NEXT:    addq $64, %rax
3002; CHECK-NEXT:    jne .LBB86_1
3003; CHECK-NEXT:  # %bb.2: # %bb10
3004; CHECK-NEXT:    vzeroupper
3005; CHECK-NEXT:    retq
3006bb:
3007  br label %bb1
3008
3009bb1:                                              ; preds = %bb1, %bb
3010  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3011  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3012  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3013  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3014  %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3015  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3016  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3017  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3018  %tmp8 = add i64 %tmp, 16
3019  %tmp9 = icmp eq i64 %tmp8, 1024
3020  br i1 %tmp9, label %bb10, label %bb1
3021
3022bb10:                                             ; preds = %bb1
3023  ret void
3024}
3025
3026define void @bcast_unfold_umin_v2i64(i64* %arg) {
3027; CHECK-LABEL: bcast_unfold_umin_v2i64:
3028; CHECK:       # %bb.0: # %bb
3029; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3030; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,2]
3031; CHECK-NEXT:    .p2align 4, 0x90
3032; CHECK-NEXT:  .LBB87_1: # %bb1
3033; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3034; CHECK-NEXT:    vpminuq 8192(%rdi,%rax), %xmm0, %xmm1
3035; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
3036; CHECK-NEXT:    addq $16, %rax
3037; CHECK-NEXT:    jne .LBB87_1
3038; CHECK-NEXT:  # %bb.2: # %bb10
3039; CHECK-NEXT:    retq
3040bb:
3041  br label %bb1
3042
3043bb1:                                              ; preds = %bb1, %bb
3044  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3045  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3046  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3047  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
3048  %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
3049  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
3050  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3051  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
3052  %tmp8 = add i64 %tmp, 2
3053  %tmp9 = icmp eq i64 %tmp8, 1024
3054  br i1 %tmp9, label %bb10, label %bb1
3055
3056bb10:                                             ; preds = %bb1
3057  ret void
3058}
3059
3060define void @bcast_unfold_umin_v4i64(i64* %arg) {
3061; CHECK-LABEL: bcast_unfold_umin_v4i64:
3062; CHECK:       # %bb.0: # %bb
3063; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3064; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
3065; CHECK-NEXT:    .p2align 4, 0x90
3066; CHECK-NEXT:  .LBB88_1: # %bb1
3067; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3068; CHECK-NEXT:    vpminuq 8192(%rdi,%rax), %ymm0, %ymm1
3069; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
3070; CHECK-NEXT:    addq $32, %rax
3071; CHECK-NEXT:    jne .LBB88_1
3072; CHECK-NEXT:  # %bb.2: # %bb10
3073; CHECK-NEXT:    vzeroupper
3074; CHECK-NEXT:    retq
3075bb:
3076  br label %bb1
3077
3078bb1:                                              ; preds = %bb1, %bb
3079  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3080  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3081  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3082  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
3083  %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
3084  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
3085  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3086  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
3087  %tmp8 = add i64 %tmp, 4
3088  %tmp9 = icmp eq i64 %tmp8, 1024
3089  br i1 %tmp9, label %bb10, label %bb1
3090
3091bb10:                                             ; preds = %bb1
3092  ret void
3093}
3094
3095define void @bcast_unfold_umin_v8i64(i64* %arg) {
3096; CHECK-LABEL: bcast_unfold_umin_v8i64:
3097; CHECK:       # %bb.0: # %bb
3098; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3099; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
3100; CHECK-NEXT:    .p2align 4, 0x90
3101; CHECK-NEXT:  .LBB89_1: # %bb1
3102; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3103; CHECK-NEXT:    vpminuq 8192(%rdi,%rax), %zmm0, %zmm1
3104; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
3105; CHECK-NEXT:    addq $64, %rax
3106; CHECK-NEXT:    jne .LBB89_1
3107; CHECK-NEXT:  # %bb.2: # %bb10
3108; CHECK-NEXT:    vzeroupper
3109; CHECK-NEXT:    retq
3110bb:
3111  br label %bb1
3112
3113bb1:                                              ; preds = %bb1, %bb
3114  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3115  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3116  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3117  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
3118  %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3119  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3120  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3121  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
3122  %tmp8 = add i64 %tmp, 8
3123  %tmp9 = icmp eq i64 %tmp8, 1024
3124  br i1 %tmp9, label %bb10, label %bb1
3125
3126bb10:                                             ; preds = %bb1
3127  ret void
3128}
3129
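; Unsigned-max tests: icmp ugt + select against a splat of 2, expecting
; vpmaxud/vpmaxuq with the constant kept in a register across the loop.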
3130define void @bcast_unfold_umax_v4i32(i32* %arg) {
3131; CHECK-LABEL: bcast_unfold_umax_v4i32:
3132; CHECK:       # %bb.0: # %bb
3133; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3134; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
3135; CHECK-NEXT:    .p2align 4, 0x90
3136; CHECK-NEXT:  .LBB90_1: # %bb1
3137; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3138; CHECK-NEXT:    vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1
3139; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
3140; CHECK-NEXT:    addq $16, %rax
3141; CHECK-NEXT:    jne .LBB90_1
3142; CHECK-NEXT:  # %bb.2: # %bb10
3143; CHECK-NEXT:    retq
3144bb:
3145  br label %bb1
3146
3147bb1:                                              ; preds = %bb1, %bb
3148  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3149  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3150  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3151  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3152  %tmp5 = icmp ugt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
3153  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
3154  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3155  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3156  %tmp8 = add i64 %tmp, 4
3157  %tmp9 = icmp eq i64 %tmp8, 1024
3158  br i1 %tmp9, label %bb10, label %bb1
3159
3160bb10:                                             ; preds = %bb1
3161  ret void
3162}
3163
3164define void @bcast_unfold_umax_v8i32(i32* %arg) {
3165; CHECK-LABEL: bcast_unfold_umax_v8i32:
3166; CHECK:       # %bb.0: # %bb
3167; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3168; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
3169; CHECK-NEXT:    .p2align 4, 0x90
3170; CHECK-NEXT:  .LBB91_1: # %bb1
3171; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3172; CHECK-NEXT:    vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1
3173; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
3174; CHECK-NEXT:    addq $32, %rax
3175; CHECK-NEXT:    jne .LBB91_1
3176; CHECK-NEXT:  # %bb.2: # %bb10
3177; CHECK-NEXT:    vzeroupper
3178; CHECK-NEXT:    retq
3179bb:
3180  br label %bb1
3181
3182bb1:                                              ; preds = %bb1, %bb
3183  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3184  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3185  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3186  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3187  %tmp5 = icmp ugt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3188  %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3189  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3190  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3191  %tmp8 = add i64 %tmp, 8
3192  %tmp9 = icmp eq i64 %tmp8, 1024
3193  br i1 %tmp9, label %bb10, label %bb1
3194
3195bb10:                                             ; preds = %bb1
3196  ret void
3197}
3198
3199define void @bcast_unfold_umax_v16i32(i32* %arg) {
3200; CHECK-LABEL: bcast_unfold_umax_v16i32:
3201; CHECK:       # %bb.0: # %bb
3202; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3203; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
3204; CHECK-NEXT:    .p2align 4, 0x90
3205; CHECK-NEXT:  .LBB92_1: # %bb1
3206; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3207; CHECK-NEXT:    vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1
3208; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
3209; CHECK-NEXT:    addq $64, %rax
3210; CHECK-NEXT:    jne .LBB92_1
3211; CHECK-NEXT:  # %bb.2: # %bb10
3212; CHECK-NEXT:    vzeroupper
3213; CHECK-NEXT:    retq
3214bb:
3215  br label %bb1
3216
3217bb1:                                              ; preds = %bb1, %bb
3218  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3219  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3220  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3221  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3222  %tmp5 = icmp ugt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3223  %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
3224  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3225  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3226  %tmp8 = add i64 %tmp, 16
3227  %tmp9 = icmp eq i64 %tmp8, 1024
3228  br i1 %tmp9, label %bb10, label %bb1
3229
3230bb10:                                             ; preds = %bb1
3231  ret void
3232}
3233
3234define void @bcast_unfold_umax_v2i64(i64* %arg) {
3235; CHECK-LABEL: bcast_unfold_umax_v2i64:
3236; CHECK:       # %bb.0: # %bb
3237; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3238; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,2]
3239; CHECK-NEXT:    .p2align 4, 0x90
3240; CHECK-NEXT:  .LBB93_1: # %bb1
3241; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3242; CHECK-NEXT:    vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1
3243; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
3244; CHECK-NEXT:    addq $16, %rax
3245; CHECK-NEXT:    jne .LBB93_1
3246; CHECK-NEXT:  # %bb.2: # %bb10
3247; CHECK-NEXT:    retq
3248bb:
3249  br label %bb1
3250
3251bb1:                                              ; preds = %bb1, %bb
3252  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3253  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3254  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3255  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8
3256  %tmp5 = icmp ugt <2 x i64> %tmp4, <i64 2, i64 2>
3257  %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2>
3258  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3259  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
3260  %tmp8 = add i64 %tmp, 2
3261  %tmp9 = icmp eq i64 %tmp8, 1024
3262  br i1 %tmp9, label %bb10, label %bb1
3263
3264bb10:                                             ; preds = %bb1
3265  ret void
3266}
3267
3268define void @bcast_unfold_umax_v4i64(i64* %arg) {
3269; CHECK-LABEL: bcast_unfold_umax_v4i64:
3270; CHECK:       # %bb.0: # %bb
3271; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3272; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
3273; CHECK-NEXT:    .p2align 4, 0x90
3274; CHECK-NEXT:  .LBB94_1: # %bb1
3275; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3276; CHECK-NEXT:    vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1
3277; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
3278; CHECK-NEXT:    addq $32, %rax
3279; CHECK-NEXT:    jne .LBB94_1
3280; CHECK-NEXT:  # %bb.2: # %bb10
3281; CHECK-NEXT:    vzeroupper
3282; CHECK-NEXT:    retq
3283bb:
3284  br label %bb1
3285
3286bb1:                                              ; preds = %bb1, %bb
3287  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3288  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3289  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3290  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
3291  %tmp5 = icmp ugt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
3292  %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2>
3293  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3294  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
3295  %tmp8 = add i64 %tmp, 4
3296  %tmp9 = icmp eq i64 %tmp8, 1024
3297  br i1 %tmp9, label %bb10, label %bb1
3298
3299bb10:                                             ; preds = %bb1
3300  ret void
3301}
3302
3303define void @bcast_unfold_umax_v8i64(i64* %arg) {
3304; CHECK-LABEL: bcast_unfold_umax_v8i64:
3305; CHECK:       # %bb.0: # %bb
3306; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3307; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
3308; CHECK-NEXT:    .p2align 4, 0x90
3309; CHECK-NEXT:  .LBB95_1: # %bb1
3310; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3311; CHECK-NEXT:    vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1
3312; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
3313; CHECK-NEXT:    addq $64, %rax
3314; CHECK-NEXT:    jne .LBB95_1
3315; CHECK-NEXT:  # %bb.2: # %bb10
3316; CHECK-NEXT:    vzeroupper
3317; CHECK-NEXT:    retq
3318bb:
3319  br label %bb1
3320
3321bb1:                                              ; preds = %bb1, %bb
3322  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3323  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3324  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3325  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8
3326  %tmp5 = icmp ugt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3327  %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
3328  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3329  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
3330  %tmp8 = add i64 %tmp, 8
3331  %tmp9 = icmp eq i64 %tmp8, 1024
3332  br i1 %tmp9, label %bb10, label %bb1
3333
3334bb10:                                             ; preds = %bb1
3335  ret void
3336}
3337
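; Compare-and-blend tests (signed greater-than): icmp sgt against a splat of 1
; selects a splat of 3 over the loaded value. The expected code compares into a
; mask register (vpcmpgtd/vpcmpgtq ... %k1) and then merges the replacement
; constant under the mask (a masked broadcast for most widths) before storing
; the result back.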
3338define void @bcast_unfold_pcmpgt_v4i32(i32* %arg) {
3339; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32:
3340; CHECK:       # %bb.0: # %bb
3341; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3342; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3343; CHECK-NEXT:    .p2align 4, 0x90
3344; CHECK-NEXT:  .LBB96_1: # %bb1
3345; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3346; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
3347; CHECK-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
3348; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
3349; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
3350; CHECK-NEXT:    addq $16, %rax
3351; CHECK-NEXT:    jne .LBB96_1
3352; CHECK-NEXT:  # %bb.2: # %bb10
3353; CHECK-NEXT:    retq
3354bb:
3355  br label %bb1
3356
3357bb1:                                              ; preds = %bb1, %bb
3358  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3359  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3360  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3361  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3362  %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3363  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3364  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3365  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3366  %tmp8 = add i64 %tmp, 4
3367  %tmp9 = icmp eq i64 %tmp8, 1024
3368  br i1 %tmp9, label %bb10, label %bb1
3369
3370bb10:                                             ; preds = %bb1
3371  ret void
3372}
3373
3374define void @bcast_unfold_pcmpgt_v8i32(i32* %arg) {
3375; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32:
3376; CHECK:       # %bb.0: # %bb
3377; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3378; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3379; CHECK-NEXT:    .p2align 4, 0x90
3380; CHECK-NEXT:  .LBB97_1: # %bb1
3381; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3382; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm1
3383; CHECK-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1
3384; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
3385; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
3386; CHECK-NEXT:    addq $32, %rax
3387; CHECK-NEXT:    jne .LBB97_1
3388; CHECK-NEXT:  # %bb.2: # %bb10
3389; CHECK-NEXT:    vzeroupper
3390; CHECK-NEXT:    retq
3391bb:
3392  br label %bb1
3393
3394bb1:                                              ; preds = %bb1, %bb
3395  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3396  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3397  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3398  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3399  %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3400  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3401  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3402  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3403  %tmp8 = add i64 %tmp, 8
3404  %tmp9 = icmp eq i64 %tmp8, 1024
3405  br i1 %tmp9, label %bb10, label %bb1
3406
3407bb10:                                             ; preds = %bb1
3408  ret void
3409}
3410
3411define void @bcast_unfold_pcmpgt_v16i32(i32* %arg) {
3412; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32:
3413; CHECK:       # %bb.0: # %bb
3414; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3415; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3416; CHECK-NEXT:    .p2align 4, 0x90
3417; CHECK-NEXT:  .LBB98_1: # %bb1
3418; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3419; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm1
3420; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
3421; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
3422; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
3423; CHECK-NEXT:    addq $64, %rax
3424; CHECK-NEXT:    jne .LBB98_1
3425; CHECK-NEXT:  # %bb.2: # %bb10
3426; CHECK-NEXT:    vzeroupper
3427; CHECK-NEXT:    retq
3428bb:
3429  br label %bb1
3430
3431bb1:                                              ; preds = %bb1, %bb
3432  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3433  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3434  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3435  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3436  %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3437  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3438  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3439  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3440  %tmp8 = add i64 %tmp, 16
3441  %tmp9 = icmp eq i64 %tmp8, 1024
3442  br i1 %tmp9, label %bb10, label %bb1
3443
3444bb10:                                             ; preds = %bb1
3445  ret void
3446}
3447
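; Note: in the <2 x i64> case the replacement constant is expected to be merged
; with a masked vmovdqa64 load of the full 128-bit constant rather than a masked
; broadcast.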
3448define void @bcast_unfold_pcmpgt_v2i64(i64* %arg) {
3449; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64:
3450; CHECK:       # %bb.0: # %bb
3451; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3452; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,1]
3453; CHECK-NEXT:    .p2align 4, 0x90
3454; CHECK-NEXT:  .LBB99_1: # %bb1
3455; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3456; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm1
3457; CHECK-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1
3458; CHECK-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
3459; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
3460; CHECK-NEXT:    addq $16, %rax
3461; CHECK-NEXT:    jne .LBB99_1
3462; CHECK-NEXT:  # %bb.2: # %bb10
3463; CHECK-NEXT:    retq
3464bb:
3465  br label %bb1
3466
3467bb1:                                              ; preds = %bb1, %bb
3468  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3469  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3470  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3471  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3472  %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 1, i64 1>
3473  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3474  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3475  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3476  %tmp8 = add i64 %tmp, 2
3477  %tmp9 = icmp eq i64 %tmp8, 1024
3478  br i1 %tmp9, label %bb10, label %bb1
3479
3480bb10:                                             ; preds = %bb1
3481  ret void
}

define void @bcast_unfold_pcmpgt_v4i64(i64* %arg) {
3484; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64:
3485; CHECK:       # %bb.0: # %bb
3486; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3487; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3488; CHECK-NEXT:    .p2align 4, 0x90
3489; CHECK-NEXT:  .LBB100_1: # %bb1
3490; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3491; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
3492; CHECK-NEXT:    vpcmpgtq %ymm0, %ymm1, %k1
3493; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
3494; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
3495; CHECK-NEXT:    addq $32, %rax
3496; CHECK-NEXT:    jne .LBB100_1
3497; CHECK-NEXT:  # %bb.2: # %bb10
3498; CHECK-NEXT:    vzeroupper
3499; CHECK-NEXT:    retq
3500bb:
3501  br label %bb1
3502
3503bb1:                                              ; preds = %bb1, %bb
3504  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3505  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3506  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3507  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3508  %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3509  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3510  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3511  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3512  %tmp8 = add i64 %tmp, 4
3513  %tmp9 = icmp eq i64 %tmp8, 1024
3514  br i1 %tmp9, label %bb10, label %bb1
3515
3516bb10:                                             ; preds = %bb1
3517  ret void
3518}
3519
3520define void @bcast_unfold_pcmpgt_v8i64(i64* %arg) {
3521; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64:
3522; CHECK:       # %bb.0: # %bb
3523; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3524; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3525; CHECK-NEXT:    .p2align 4, 0x90
3526; CHECK-NEXT:  .LBB101_1: # %bb1
3527; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3528; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm1
3529; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
3530; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
3531; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
3532; CHECK-NEXT:    addq $64, %rax
3533; CHECK-NEXT:    jne .LBB101_1
3534; CHECK-NEXT:  # %bb.2: # %bb10
3535; CHECK-NEXT:    vzeroupper
3536; CHECK-NEXT:    retq
3537bb:
3538  br label %bb1
3539
3540bb1:                                              ; preds = %bb1, %bb
3541  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3542  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3543  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3544  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3545  %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3546  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3547  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3548  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3549  %tmp8 = add i64 %tmp, 8
3550  %tmp9 = icmp eq i64 %tmp8, 1024
3551  br i1 %tmp9, label %bb10, label %bb1
3552
3553bb10:                                             ; preds = %bb1
3554  ret void
3555}
3556
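; Compare-and-blend tests (equality): same pattern as the pcmpgt tests above,
; using icmp eq and expecting vpcmpeqd/vpcmpeqq into a mask followed by a masked
; merge of the replacement constant.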
3557define void @bcast_unfold_pcmpeq_v4i32(i32* %arg) {
3558; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32:
3559; CHECK:       # %bb.0: # %bb
3560; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3561; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3562; CHECK-NEXT:    .p2align 4, 0x90
3563; CHECK-NEXT:  .LBB102_1: # %bb1
3564; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3565; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
3566; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1
3567; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
3568; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
3569; CHECK-NEXT:    addq $16, %rax
3570; CHECK-NEXT:    jne .LBB102_1
3571; CHECK-NEXT:  # %bb.2: # %bb10
3572; CHECK-NEXT:    retq
3573bb:
3574  br label %bb1
3575
3576bb1:                                              ; preds = %bb1, %bb
3577  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3578  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3579  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3580  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3581  %tmp5 = icmp eq <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3582  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3583  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3584  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3585  %tmp8 = add i64 %tmp, 4
3586  %tmp9 = icmp eq i64 %tmp8, 1024
3587  br i1 %tmp9, label %bb10, label %bb1
3588
3589bb10:                                             ; preds = %bb1
3590  ret void
3591}
3592
3593define void @bcast_unfold_pcmpeq_v8i32(i32* %arg) {
3594; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32:
3595; CHECK:       # %bb.0: # %bb
3596; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3597; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3598; CHECK-NEXT:    .p2align 4, 0x90
3599; CHECK-NEXT:  .LBB103_1: # %bb1
3600; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3601; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %ymm1
3602; CHECK-NEXT:    vpcmpeqd %ymm0, %ymm1, %k1
3603; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
3604; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
3605; CHECK-NEXT:    addq $32, %rax
3606; CHECK-NEXT:    jne .LBB103_1
3607; CHECK-NEXT:  # %bb.2: # %bb10
3608; CHECK-NEXT:    vzeroupper
3609; CHECK-NEXT:    retq
3610bb:
3611  br label %bb1
3612
3613bb1:                                              ; preds = %bb1, %bb
3614  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3615  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3616  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3617  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3618  %tmp5 = icmp eq <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3619  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3620  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3621  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3622  %tmp8 = add i64 %tmp, 8
3623  %tmp9 = icmp eq i64 %tmp8, 1024
3624  br i1 %tmp9, label %bb10, label %bb1
3625
3626bb10:                                             ; preds = %bb1
3627  ret void
3628}
3629
3630define void @bcast_unfold_pcmpeq_v16i32(i32* %arg) {
3631; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32:
3632; CHECK:       # %bb.0: # %bb
3633; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
3634; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3635; CHECK-NEXT:    .p2align 4, 0x90
3636; CHECK-NEXT:  .LBB104_1: # %bb1
3637; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3638; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm1
3639; CHECK-NEXT:    vpcmpeqd %zmm0, %zmm1, %k1
3640; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
3641; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
3642; CHECK-NEXT:    addq $64, %rax
3643; CHECK-NEXT:    jne .LBB104_1
3644; CHECK-NEXT:  # %bb.2: # %bb10
3645; CHECK-NEXT:    vzeroupper
3646; CHECK-NEXT:    retq
3647bb:
3648  br label %bb1
3649
3650bb1:                                              ; preds = %bb1, %bb
3651  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3652  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3653  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3654  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3655  %tmp5 = icmp eq <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3656  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3657  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3658  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3659  %tmp8 = add i64 %tmp, 16
3660  %tmp9 = icmp eq i64 %tmp8, 1024
3661  br i1 %tmp9, label %bb10, label %bb1
3662
3663bb10:                                             ; preds = %bb1
3664  ret void
3665}
3666
3667define void @bcast_unfold_pcmpeq_v2i64(i64* %arg) {
3668; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64:
3669; CHECK:       # %bb.0: # %bb
3670; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3671; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,1]
3672; CHECK-NEXT:    .p2align 4, 0x90
3673; CHECK-NEXT:  .LBB105_1: # %bb1
3674; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3675; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm1
3676; CHECK-NEXT:    vpcmpeqq %xmm0, %xmm1, %k1
3677; CHECK-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
3678; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
3679; CHECK-NEXT:    addq $16, %rax
3680; CHECK-NEXT:    jne .LBB105_1
3681; CHECK-NEXT:  # %bb.2: # %bb10
3682; CHECK-NEXT:    retq
3683bb:
3684  br label %bb1
3685
3686bb1:                                              ; preds = %bb1, %bb
3687  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3688  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3689  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3690  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3691  %tmp5 = icmp eq <2 x i64> %tmp4, <i64 1, i64 1>
3692  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3693  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3694  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3695  %tmp8 = add i64 %tmp, 2
3696  %tmp9 = icmp eq i64 %tmp8, 1024
3697  br i1 %tmp9, label %bb10, label %bb1
3698
3699bb10:                                             ; preds = %bb1
3700  ret void
}

define void @bcast_unfold_pcmpeq_v4i64(i64* %arg) {
3703; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64:
3704; CHECK:       # %bb.0: # %bb
3705; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3706; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3707; CHECK-NEXT:    .p2align 4, 0x90
3708; CHECK-NEXT:  .LBB106_1: # %bb1
3709; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3710; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
3711; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm1, %k1
3712; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
3713; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
3714; CHECK-NEXT:    addq $32, %rax
3715; CHECK-NEXT:    jne .LBB106_1
3716; CHECK-NEXT:  # %bb.2: # %bb10
3717; CHECK-NEXT:    vzeroupper
3718; CHECK-NEXT:    retq
3719bb:
3720  br label %bb1
3721
3722bb1:                                              ; preds = %bb1, %bb
3723  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3724  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3725  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3726  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3727  %tmp5 = icmp eq <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3728  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3729  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3730  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3731  %tmp8 = add i64 %tmp, 4
3732  %tmp9 = icmp eq i64 %tmp8, 1024
3733  br i1 %tmp9, label %bb10, label %bb1
3734
3735bb10:                                             ; preds = %bb1
3736  ret void
3737}
3738
3739define void @bcast_unfold_pcmpeq_v8i64(i64* %arg) {
3740; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64:
3741; CHECK:       # %bb.0: # %bb
3742; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
3743; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3744; CHECK-NEXT:    .p2align 4, 0x90
3745; CHECK-NEXT:  .LBB107_1: # %bb1
3746; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3747; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm1
3748; CHECK-NEXT:    vpcmpeqq %zmm0, %zmm1, %k1
3749; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
3750; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
3751; CHECK-NEXT:    addq $64, %rax
3752; CHECK-NEXT:    jne .LBB107_1
3753; CHECK-NEXT:  # %bb.2: # %bb10
3754; CHECK-NEXT:    vzeroupper
3755; CHECK-NEXT:    retq
3756bb:
3757  br label %bb1
3758
3759bb1:                                              ; preds = %bb1, %bb
3760  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3761  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3762  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3763  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3764  %tmp5 = icmp eq <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3765  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3766  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3767  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3768  %tmp8 = add i64 %tmp, 8
3769  %tmp9 = icmp eq i64 %tmp8, 1024
3770  br i1 %tmp9, label %bb10, label %bb1
3771
3772bb10:                                             ; preds = %bb1
3773  ret void
3774}
3775
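; Compare-and-blend tests with a signed loop condition: these variants drive the
; loop with icmp slt on the induction variable instead of an equality check, so
; the generated loops end in cmpq $1023, %rax / jg rather than the addq/jne
; pattern used above. The vector compare is expected to lower to
; vpcmpltd/vpcmpltq into a mask followed by a masked merge of the replacement
; constant.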
3776define void @bcast_unfold_pcmp_v4i32(i32* %arg) {
3777; CHECK-LABEL: bcast_unfold_pcmp_v4i32:
3778; CHECK:       # %bb.0: # %bb
3779; CHECK-NEXT:    xorl %eax, %eax
3780; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
3781; CHECK-NEXT:    .p2align 4, 0x90
3782; CHECK-NEXT:  .LBB108_1: # %bb1
3783; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3784; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %xmm1
3785; CHECK-NEXT:    vpcmpltd %xmm0, %xmm1, %k1
3786; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
3787; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,4)
3788; CHECK-NEXT:    addq $4, %rax
3789; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
3790; CHECK-NEXT:    jg .LBB108_1
3791; CHECK-NEXT:  # %bb.2: # %bb10
3792; CHECK-NEXT:    retq
3793bb:
3794  br label %bb1
3795
3796bb1:                                              ; preds = %bb1, %bb
3797  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3798  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3799  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
3800  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
3801  %tmp5 = icmp slt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1>
3802  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
3803  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
3804  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
3805  %tmp8 = add i64 %tmp, 4
3806  %tmp9 = icmp slt i64 %tmp8, 1024
3807  br i1 %tmp9, label %bb10, label %bb1
3808
3809bb10:                                             ; preds = %bb1
3810  ret void
3811}
3812
3813define void @bcast_unfold_pcmp_v8i32(i32* %arg) {
3814; CHECK-LABEL: bcast_unfold_pcmp_v8i32:
3815; CHECK:       # %bb.0: # %bb
3816; CHECK-NEXT:    xorl %eax, %eax
3817; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
3818; CHECK-NEXT:    .p2align 4, 0x90
3819; CHECK-NEXT:  .LBB109_1: # %bb1
3820; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3821; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %ymm1
3822; CHECK-NEXT:    vpcmpltd %ymm0, %ymm1, %k1
3823; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
3824; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,4)
3825; CHECK-NEXT:    addq $8, %rax
3826; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
3827; CHECK-NEXT:    jg .LBB109_1
3828; CHECK-NEXT:  # %bb.2: # %bb10
3829; CHECK-NEXT:    vzeroupper
3830; CHECK-NEXT:    retq
3831bb:
3832  br label %bb1
3833
3834bb1:                                              ; preds = %bb1, %bb
3835  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3836  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3837  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
3838  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
3839  %tmp5 = icmp slt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3840  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
3841  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
3842  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
3843  %tmp8 = add i64 %tmp, 8
3844  %tmp9 = icmp slt i64 %tmp8, 1024
3845  br i1 %tmp9, label %bb10, label %bb1
3846
3847bb10:                                             ; preds = %bb1
3848  ret void
3849}
3850
3851define void @bcast_unfold_pcmp_v16i32(i32* %arg) {
3852; CHECK-LABEL: bcast_unfold_pcmp_v16i32:
3853; CHECK:       # %bb.0: # %bb
3854; CHECK-NEXT:    xorl %eax, %eax
3855; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
3856; CHECK-NEXT:    .p2align 4, 0x90
3857; CHECK-NEXT:  .LBB110_1: # %bb1
3858; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3859; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,4), %zmm1
3860; CHECK-NEXT:    vpcmpltd %zmm0, %zmm1, %k1
3861; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
3862; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,4)
3863; CHECK-NEXT:    addq $16, %rax
3864; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
3865; CHECK-NEXT:    jg .LBB110_1
3866; CHECK-NEXT:  # %bb.2: # %bb10
3867; CHECK-NEXT:    vzeroupper
3868; CHECK-NEXT:    retq
3869bb:
3870  br label %bb1
3871
3872bb1:                                              ; preds = %bb1, %bb
3873  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3874  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
3875  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
3876  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
3877  %tmp5 = icmp slt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
3878  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
3879  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
3880  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
3881  %tmp8 = add i64 %tmp, 16
3882  %tmp9 = icmp slt i64 %tmp8, 1024
3883  br i1 %tmp9, label %bb10, label %bb1
3884
3885bb10:                                             ; preds = %bb1
3886  ret void
3887}
3888
3889define void @bcast_unfold_pcmp_v2i64(i64* %arg) {
3890; CHECK-LABEL: bcast_unfold_pcmp_v2i64:
3891; CHECK:       # %bb.0: # %bb
3892; CHECK-NEXT:    xorl %eax, %eax
3893; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,1]
3894; CHECK-NEXT:    .p2align 4, 0x90
3895; CHECK-NEXT:  .LBB111_1: # %bb1
3896; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3897; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %xmm1
3898; CHECK-NEXT:    vpcmpltq %xmm0, %xmm1, %k1
3899; CHECK-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
3900; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,8)
3901; CHECK-NEXT:    addq $2, %rax
3902; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
3903; CHECK-NEXT:    jg .LBB111_1
3904; CHECK-NEXT:  # %bb.2: # %bb10
3905; CHECK-NEXT:    retq
3906bb:
3907  br label %bb1
3908
3909bb1:                                              ; preds = %bb1, %bb
3910  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3911  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3912  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
3913  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
3914  %tmp5 = icmp slt <2 x i64> %tmp4, <i64 1, i64 1>
3915  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
3916  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
3917  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
3918  %tmp8 = add i64 %tmp, 2
3919  %tmp9 = icmp slt i64 %tmp8, 1024
3920  br i1 %tmp9, label %bb10, label %bb1
3921
3922bb10:                                             ; preds = %bb1
3923  ret void
3924}

3925define void @bcast_unfold_pcmp_v4i64(i64* %arg) {
3926; CHECK-LABEL: bcast_unfold_pcmp_v4i64:
3927; CHECK:       # %bb.0: # %bb
3928; CHECK-NEXT:    xorl %eax, %eax
3929; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1]
3930; CHECK-NEXT:    .p2align 4, 0x90
3931; CHECK-NEXT:  .LBB112_1: # %bb1
3932; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3933; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %ymm1
3934; CHECK-NEXT:    vpcmpltq %ymm0, %ymm1, %k1
3935; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
3936; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,8)
3937; CHECK-NEXT:    addq $4, %rax
3938; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
3939; CHECK-NEXT:    jg .LBB112_1
3940; CHECK-NEXT:  # %bb.2: # %bb10
3941; CHECK-NEXT:    vzeroupper
3942; CHECK-NEXT:    retq
3943bb:
3944  br label %bb1
3945
3946bb1:                                              ; preds = %bb1, %bb
3947  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3948  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3949  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
3950  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
3951  %tmp5 = icmp slt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1>
3952  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
3953  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
3954  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
3955  %tmp8 = add i64 %tmp, 4
3956  %tmp9 = icmp slt i64 %tmp8, 1024
3957  br i1 %tmp9, label %bb10, label %bb1
3958
3959bb10:                                             ; preds = %bb1
3960  ret void
3961}
3962
3963define void @bcast_unfold_pcmp_v8i64(i64* %arg) {
3964; CHECK-LABEL: bcast_unfold_pcmp_v8i64:
3965; CHECK:       # %bb.0: # %bb
3966; CHECK-NEXT:    xorl %eax, %eax
3967; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
3968; CHECK-NEXT:    .p2align 4, 0x90
3969; CHECK-NEXT:  .LBB113_1: # %bb1
3970; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
3971; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,8), %zmm1
3972; CHECK-NEXT:    vpcmpltq %zmm0, %zmm1, %k1
3973; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
3974; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,8)
3975; CHECK-NEXT:    addq $8, %rax
3976; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
3977; CHECK-NEXT:    jg .LBB113_1
3978; CHECK-NEXT:  # %bb.2: # %bb10
3979; CHECK-NEXT:    vzeroupper
3980; CHECK-NEXT:    retq
3981bb:
3982  br label %bb1
3983
3984bb1:                                              ; preds = %bb1, %bb
3985  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
3986  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
3987  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
3988  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
3989  %tmp5 = icmp slt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
3990  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
3991  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
3992  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
3993  %tmp8 = add i64 %tmp, 8
3994  %tmp9 = icmp slt i64 %tmp8, 1024
3995  br i1 %tmp9, label %bb10, label %bb1
3996
3997bb10:                                             ; preds = %bb1
3998  ret void
3999}
4000
4001define void @bcast_unfold_pcmpu_v4i32(i32* %arg) {
4002; CHECK-LABEL: bcast_unfold_pcmpu_v4i32:
4003; CHECK:       # %bb.0: # %bb
4004; CHECK-NEXT:    xorl %eax, %eax
4005; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4006; CHECK-NEXT:    .p2align 4, 0x90
4007; CHECK-NEXT:  .LBB114_1: # %bb1
4008; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4009; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %xmm1
4010; CHECK-NEXT:    vpcmpltud %xmm0, %xmm1, %k1
4011; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
4012; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,4)
4013; CHECK-NEXT:    addq $4, %rax
4014; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
4015; CHECK-NEXT:    ja .LBB114_1
4016; CHECK-NEXT:  # %bb.2: # %bb10
4017; CHECK-NEXT:    retq
4018bb:
4019  br label %bb1
4020
4021bb1:                                              ; preds = %bb1, %bb
4022  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4023  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4024  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4025  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4026  %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4027  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4028  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4029  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4030  %tmp8 = add i64 %tmp, 4
4031  %tmp9 = icmp ult i64 %tmp8, 1024
4032  br i1 %tmp9, label %bb10, label %bb1
4033
4034bb10:                                             ; preds = %bb1
4035  ret void
4036}
4037
4038define void @bcast_unfold_pcmpu_v8i32(i32* %arg) {
4039; CHECK-LABEL: bcast_unfold_pcmpu_v8i32:
4040; CHECK:       # %bb.0: # %bb
4041; CHECK-NEXT:    xorl %eax, %eax
4042; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
4043; CHECK-NEXT:    .p2align 4, 0x90
4044; CHECK-NEXT:  .LBB115_1: # %bb1
4045; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4046; CHECK-NEXT:    vmovdqu (%rdi,%rax,4), %ymm1
4047; CHECK-NEXT:    vpcmpltud %ymm0, %ymm1, %k1
4048; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1 {%k1}
4049; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,4)
4050; CHECK-NEXT:    addq $8, %rax
4051; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
4052; CHECK-NEXT:    ja .LBB115_1
4053; CHECK-NEXT:  # %bb.2: # %bb10
4054; CHECK-NEXT:    vzeroupper
4055; CHECK-NEXT:    retq
4056bb:
4057  br label %bb1
4058
4059bb1:                                              ; preds = %bb1, %bb
4060  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4061  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4062  %tmp3 = bitcast i32* %tmp2 to <8 x i32>*
4063  %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4
4064  %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
4065  %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4
4066  %tmp7 = bitcast i32* %tmp2 to <8 x i32>*
4067  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
4068  %tmp8 = add i64 %tmp, 8
4069  %tmp9 = icmp ult i64 %tmp8, 1024
4070  br i1 %tmp9, label %bb10, label %bb1
4071
4072bb10:                                             ; preds = %bb1
4073  ret void
4074}
4075
4076define void @bcast_unfold_pcmpu_v16i32(i32* %arg) {
4077; CHECK-LABEL: bcast_unfold_pcmpu_v16i32:
4078; CHECK:       # %bb.0: # %bb
4079; CHECK-NEXT:    xorl %eax, %eax
4080; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
4081; CHECK-NEXT:    .p2align 4, 0x90
4082; CHECK-NEXT:  .LBB116_1: # %bb1
4083; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4084; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,4), %zmm1
4085; CHECK-NEXT:    vpcmpltud %zmm0, %zmm1, %k1
4086; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm1 {%k1}
4087; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,4)
4088; CHECK-NEXT:    addq $16, %rax
4089; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
4090; CHECK-NEXT:    ja .LBB116_1
4091; CHECK-NEXT:  # %bb.2: # %bb10
4092; CHECK-NEXT:    vzeroupper
4093; CHECK-NEXT:    retq
4094bb:
4095  br label %bb1
4096
4097bb1:                                              ; preds = %bb1, %bb
4098  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4099  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4100  %tmp3 = bitcast i32* %tmp2 to <16 x i32>*
4101  %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4
4102  %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
4103  %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4
4104  %tmp7 = bitcast i32* %tmp2 to <16 x i32>*
4105  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
4106  %tmp8 = add i64 %tmp, 16
4107  %tmp9 = icmp ult i64 %tmp8, 1024
4108  br i1 %tmp9, label %bb10, label %bb1
4109
4110bb10:                                             ; preds = %bb1
4111  ret void
4112}
4113
4114define void @bcast_unfold_pcmpu_v2i64(i64* %arg) {
4115; CHECK-LABEL: bcast_unfold_pcmpu_v2i64:
4116; CHECK:       # %bb.0: # %bb
4117; CHECK-NEXT:    xorl %eax, %eax
4118; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,2]
4119; CHECK-NEXT:    .p2align 4, 0x90
4120; CHECK-NEXT:  .LBB117_1: # %bb1
4121; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4122; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %xmm1
4123; CHECK-NEXT:    vpcmpltuq %xmm0, %xmm1, %k1
4124; CHECK-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
4125; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,8)
4126; CHECK-NEXT:    addq $2, %rax
4127; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
4128; CHECK-NEXT:    ja .LBB117_1
4129; CHECK-NEXT:  # %bb.2: # %bb10
4130; CHECK-NEXT:    retq
4131bb:
4132  br label %bb1
4133
4134bb1:                                              ; preds = %bb1, %bb
4135  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4136  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4137  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
4138  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
4139  %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
4140  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
4141  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
4142  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
4143  %tmp8 = add i64 %tmp, 2
4144  %tmp9 = icmp ult i64 %tmp8, 1024
4145  br i1 %tmp9, label %bb10, label %bb1
4146
4147bb10:                                             ; preds = %bb1
4148  ret void
4149}

4150define void @bcast_unfold_pcmpu_v4i64(i64* %arg) {
4151; CHECK-LABEL: bcast_unfold_pcmpu_v4i64:
4152; CHECK:       # %bb.0: # %bb
4153; CHECK-NEXT:    xorl %eax, %eax
4154; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4155; CHECK-NEXT:    .p2align 4, 0x90
4156; CHECK-NEXT:  .LBB118_1: # %bb1
4157; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4158; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %ymm1
4159; CHECK-NEXT:    vpcmpltuq %ymm0, %ymm1, %k1
4160; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
4161; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,8)
4162; CHECK-NEXT:    addq $4, %rax
4163; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
4164; CHECK-NEXT:    ja .LBB118_1
4165; CHECK-NEXT:  # %bb.2: # %bb10
4166; CHECK-NEXT:    vzeroupper
4167; CHECK-NEXT:    retq
4168bb:
4169  br label %bb1
4170
4171bb1:                                              ; preds = %bb1, %bb
4172  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4173  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4174  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4175  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
4176  %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4177  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4178  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4179  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
4180  %tmp8 = add i64 %tmp, 4
4181  %tmp9 = icmp ult i64 %tmp8, 1024
4182  br i1 %tmp9, label %bb10, label %bb1
4183
4184bb10:                                             ; preds = %bb1
4185  ret void
4186}
4187
4188define void @bcast_unfold_pcmpu_v8i64(i64* %arg) {
4189; CHECK-LABEL: bcast_unfold_pcmpu_v8i64:
4190; CHECK:       # %bb.0: # %bb
4191; CHECK-NEXT:    xorl %eax, %eax
4192; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
4193; CHECK-NEXT:    .p2align 4, 0x90
4194; CHECK-NEXT:  .LBB119_1: # %bb1
4195; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4196; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,8), %zmm1
4197; CHECK-NEXT:    vpcmpltuq %zmm0, %zmm1, %k1
4198; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
4199; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,8)
4200; CHECK-NEXT:    addq $8, %rax
4201; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
4202; CHECK-NEXT:    ja .LBB119_1
4203; CHECK-NEXT:  # %bb.2: # %bb10
4204; CHECK-NEXT:    vzeroupper
4205; CHECK-NEXT:    retq
4206bb:
4207  br label %bb1
4208
4209bb1:                                              ; preds = %bb1, %bb
4210  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4211  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4212  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
4213  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
4214  %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
4215  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
4216  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
4217  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
4218  %tmp8 = add i64 %tmp, 8
4219  %tmp9 = icmp ult i64 %tmp8, 1024
4220  br i1 %tmp9, label %bb10, label %bb1
4221
4222bb10:                                             ; preds = %bb1
4223  ret void
4224}
4225
4226define void @bcast_unfold_cmp_v4f32(float* %arg) {
4227; CHECK-LABEL: bcast_unfold_cmp_v4f32:
4228; CHECK:       # %bb.0: # %bb
4229; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
4230; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4231; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4232; CHECK-NEXT:    .p2align 4, 0x90
4233; CHECK-NEXT:  .LBB120_1: # %bb1
4234; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4235; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm2
4236; CHECK-NEXT:    vcmpltps %xmm0, %xmm2, %k1
4237; CHECK-NEXT:    vblendmps %xmm2, %xmm1, %xmm2 {%k1}
4238; CHECK-NEXT:    vmovups %xmm2, 4096(%rdi,%rax)
4239; CHECK-NEXT:    addq $16, %rax
4240; CHECK-NEXT:    jne .LBB120_1
4241; CHECK-NEXT:  # %bb.2: # %bb10
4242; CHECK-NEXT:    retq
4243bb:
4244  br label %bb1
4245
4246bb1:                                              ; preds = %bb1, %bb
4247  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4248  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4249  %tmp3 = bitcast float* %tmp2 to <4 x float>*
4250  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
4251  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4252  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4253  %tmp7 = bitcast float* %tmp2 to <4 x float>*
4254  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
4255  %tmp8 = add i64 %tmp, 4
4256  %tmp9 = icmp eq i64 %tmp8, 1024
4257  br i1 %tmp9, label %bb10, label %bb1
4258
4259bb10:                                             ; preds = %bb1
4260  ret void
4261}
4262
4263define void @bcast_unfold_cmp_v8f32(float* %arg) {
4264; CHECK-LABEL: bcast_unfold_cmp_v8f32:
4265; CHECK:       # %bb.0: # %bb
4266; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
4267; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4268; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4269; CHECK-NEXT:    .p2align 4, 0x90
4270; CHECK-NEXT:  .LBB121_1: # %bb1
4271; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4272; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm2
4273; CHECK-NEXT:    vcmpltps %ymm0, %ymm2, %k1
4274; CHECK-NEXT:    vblendmps %ymm2, %ymm1, %ymm2 {%k1}
4275; CHECK-NEXT:    vmovups %ymm2, 4096(%rdi,%rax)
4276; CHECK-NEXT:    addq $32, %rax
4277; CHECK-NEXT:    jne .LBB121_1
4278; CHECK-NEXT:  # %bb.2: # %bb10
4279; CHECK-NEXT:    vzeroupper
4280; CHECK-NEXT:    retq
4281bb:
4282  br label %bb1
4283
4284bb1:                                              ; preds = %bb1, %bb
4285  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4286  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4287  %tmp3 = bitcast float* %tmp2 to <8 x float>*
4288  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
4289  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4290  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4291  %tmp7 = bitcast float* %tmp2 to <8 x float>*
4292  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
4293  %tmp8 = add i64 %tmp, 8
4294  %tmp9 = icmp eq i64 %tmp8, 1024
4295  br i1 %tmp9, label %bb10, label %bb1
4296
4297bb10:                                             ; preds = %bb1
4298  ret void
4299}
4300
4301define void @bcast_unfold_cmp_v16f32(float* %arg) {
4302; CHECK-LABEL: bcast_unfold_cmp_v16f32:
4303; CHECK:       # %bb.0: # %bb
4304; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
4305; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4306; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4307; CHECK-NEXT:    .p2align 4, 0x90
4308; CHECK-NEXT:  .LBB122_1: # %bb1
4309; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4310; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm2
4311; CHECK-NEXT:    vcmpltps %zmm0, %zmm2, %k1
4312; CHECK-NEXT:    vblendmps %zmm2, %zmm1, %zmm2 {%k1}
4313; CHECK-NEXT:    vmovups %zmm2, 4096(%rdi,%rax)
4314; CHECK-NEXT:    addq $64, %rax
4315; CHECK-NEXT:    jne .LBB122_1
4316; CHECK-NEXT:  # %bb.2: # %bb10
4317; CHECK-NEXT:    vzeroupper
4318; CHECK-NEXT:    retq
4319bb:
4320  br label %bb1
4321
4322bb1:                                              ; preds = %bb1, %bb
4323  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4324  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
4325  %tmp3 = bitcast float* %tmp2 to <16 x float>*
4326  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
4327  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4328  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4329  %tmp7 = bitcast float* %tmp2 to <16 x float>*
4330  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
4331  %tmp8 = add i64 %tmp, 16
4332  %tmp9 = icmp eq i64 %tmp8, 1024
4333  br i1 %tmp9, label %bb10, label %bb1
4334
4335bb10:                                             ; preds = %bb1
4336  ret void
4337}
4338
4339define void @bcast_unfold_cmp_v2f64(double* %arg) {
4340; CHECK-LABEL: bcast_unfold_cmp_v2f64:
4341; CHECK:       # %bb.0: # %bb
4342; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
4343; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
4344; CHECK-NEXT:    vmovapd {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
4345; CHECK-NEXT:    .p2align 4, 0x90
4346; CHECK-NEXT:  .LBB123_1: # %bb1
4347; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4348; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm2
4349; CHECK-NEXT:    vcmpltpd %xmm0, %xmm2, %k1
4350; CHECK-NEXT:    vblendmpd %xmm2, %xmm1, %xmm2 {%k1}
4351; CHECK-NEXT:    vmovupd %xmm2, 8192(%rdi,%rax)
4352; CHECK-NEXT:    addq $16, %rax
4353; CHECK-NEXT:    jne .LBB123_1
4354; CHECK-NEXT:  # %bb.2: # %bb10
4355; CHECK-NEXT:    retq
4356bb:
4357  br label %bb1
4358
4359bb1:                                              ; preds = %bb1, %bb
4360  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4361  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4362  %tmp3 = bitcast double* %tmp2 to <2 x double>*
4363  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
4364  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
4365  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 3.000000e+00, double 3.000000e+00>
4366  %tmp7 = bitcast double* %tmp2 to <2 x double>*
4367  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
4368  %tmp8 = add i64 %tmp, 2
4369  %tmp9 = icmp eq i64 %tmp8, 1024
4370  br i1 %tmp9, label %bb10, label %bb1
4371
4372bb10:                                             ; preds = %bb1
4373  ret void
4374}
4375
4376define void @bcast_unfold_cmp_v4f64(double* %arg) {
4377; CHECK-LABEL: bcast_unfold_cmp_v4f64:
4378; CHECK:       # %bb.0: # %bb
4379; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
4380; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4381; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4382; CHECK-NEXT:    .p2align 4, 0x90
4383; CHECK-NEXT:  .LBB124_1: # %bb1
4384; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4385; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm2
4386; CHECK-NEXT:    vcmpltpd %ymm0, %ymm2, %k1
4387; CHECK-NEXT:    vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
4388; CHECK-NEXT:    vmovupd %ymm2, 8192(%rdi,%rax)
4389; CHECK-NEXT:    addq $32, %rax
4390; CHECK-NEXT:    jne .LBB124_1
4391; CHECK-NEXT:  # %bb.2: # %bb10
4392; CHECK-NEXT:    vzeroupper
4393; CHECK-NEXT:    retq
4394bb:
4395  br label %bb1
4396
4397bb1:                                              ; preds = %bb1, %bb
4398  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4399  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4400  %tmp3 = bitcast double* %tmp2 to <4 x double>*
4401  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
4402  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
4403  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
4404  %tmp7 = bitcast double* %tmp2 to <4 x double>*
4405  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
4406  %tmp8 = add i64 %tmp, 4
4407  %tmp9 = icmp eq i64 %tmp8, 1024
4408  br i1 %tmp9, label %bb10, label %bb1
4409
4410bb10:                                             ; preds = %bb1
4411  ret void
4412}
4413
4414define void @bcast_unfold_cmp_v8f64(double* %arg) {
4415; CHECK-LABEL: bcast_unfold_cmp_v8f64:
4416; CHECK:       # %bb.0: # %bb
4417; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
4418; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4419; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4420; CHECK-NEXT:    .p2align 4, 0x90
4421; CHECK-NEXT:  .LBB125_1: # %bb1
4422; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4423; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm2
4424; CHECK-NEXT:    vcmpltpd %zmm0, %zmm2, %k1
4425; CHECK-NEXT:    vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
4426; CHECK-NEXT:    vmovupd %zmm2, 8192(%rdi,%rax)
4427; CHECK-NEXT:    addq $64, %rax
4428; CHECK-NEXT:    jne .LBB125_1
4429; CHECK-NEXT:  # %bb.2: # %bb10
4430; CHECK-NEXT:    vzeroupper
4431; CHECK-NEXT:    retq
4432bb:
4433  br label %bb1
4434
4435bb1:                                              ; preds = %bb1, %bb
4436  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4437  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
4438  %tmp3 = bitcast double* %tmp2 to <8 x double>*
4439  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
4440  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
4441  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
4442  %tmp7 = bitcast double* %tmp2 to <8 x double>*
4443  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
4444  %tmp8 = add i64 %tmp, 8
4445  %tmp9 = icmp eq i64 %tmp8, 1024
4446  br i1 %tmp9, label %bb10, label %bb1
4447
4448bb10:                                             ; preds = %bb1
4449  ret void
4450}
4451
4452define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
4453; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold:
4454; CHECK:       # %bb.0:
4455; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
4456; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
4457; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
4458; CHECK-NEXT:    .p2align 4, 0x90
4459; CHECK-NEXT:  .LBB126_1: # =>This Inner Loop Header: Depth=1
4460; CHECK-NEXT:    vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
4461; CHECK-NEXT:    vblendmps {{.*}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
4462; CHECK-NEXT:    vmovups %ymm2, 4096(%rdi,%rax)
4463; CHECK-NEXT:    addq $32, %rax
4464; CHECK-NEXT:    jne .LBB126_1
4465; CHECK-NEXT:  # %bb.2:
4466; CHECK-NEXT:    vzeroupper
4467; CHECK-NEXT:    retq
4468  br label %2
4469
44702:                                                ; preds = %2, %1
4471  %3 = phi i64 [ 0, %1 ], [ %10, %2 ]
4472  %4 = getelementptr inbounds float, float* %0, i64 %3
4473  %5 = bitcast float* %4 to <8 x float>*
4474  %6 = load <8 x float>, <8 x float>* %5, align 4
4475  %7 = fcmp olt <8 x float> %6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
4476  %8 = select <8 x i1> %7, <8 x float> <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
4477  %9 = bitcast float* %4 to <8 x float>*
4478  store <8 x float> %8, <8 x float>* %9, align 4
4479  %10 = add i64 %3, 8
4480  %11 = icmp eq i64 %10, 1024
4481  br i1 %11, label %12, label %2
4482
448312:                                               ; preds = %2
4484  ret void
4485}
4486
4487define void @bcast_unfold_ptestm_v4i32(i32* %arg) {
4488; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
4489; CHECK:       # %bb.0: # %bb
4490; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
4491; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4492; CHECK-NEXT:    .p2align 4, 0x90
4493; CHECK-NEXT:  .LBB127_1: # %bb1
4494; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4495; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
4496; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k1
4497; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
4498; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
4499; CHECK-NEXT:    addq $16, %rax
4500; CHECK-NEXT:    jne .LBB127_1
4501; CHECK-NEXT:  # %bb.2: # %bb10
4502; CHECK-NEXT:    retq
4503bb:
4504  br label %bb1
4505
4506bb1:                                              ; preds = %bb1, %bb
4507  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4508  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4509  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4510  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4511  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4512  %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
4513  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4514  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4515  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4516  %tmp8 = add i64 %tmp, 4
4517  %tmp9 = icmp eq i64 %tmp8, 1024
4518  br i1 %tmp9, label %bb10, label %bb1
4519
4520bb10:                                             ; preds = %bb1
4521  ret void
4522}
4523
4524define void @bcast_unfold_ptestnm_v4i32(i32* %arg) {
4525; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
4526; CHECK:       # %bb.0: # %bb
4527; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
4528; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
4529; CHECK-NEXT:    .p2align 4, 0x90
4530; CHECK-NEXT:  .LBB128_1: # %bb1
4531; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4532; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
4533; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k1
4534; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
4535; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
4536; CHECK-NEXT:    addq $16, %rax
4537; CHECK-NEXT:    jne .LBB128_1
4538; CHECK-NEXT:  # %bb.2: # %bb10
4539; CHECK-NEXT:    retq
4540bb:
4541  br label %bb1
4542
4543bb1:                                              ; preds = %bb1, %bb
4544  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4545  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4546  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
4547  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
4548  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
4549  %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
4550  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
4551  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
4552  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
4553  %tmp8 = add i64 %tmp, 4
4554  %tmp9 = icmp eq i64 %tmp8, 1024
4555  br i1 %tmp9, label %bb10, label %bb1
4556
4557bb10:                                             ; preds = %bb1
4558  ret void
4559}
4560
4561define void @bcast_unfold_ptestm_v4i64(i64* %arg) {
4562; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
4563; CHECK:       # %bb.0: # %bb
4564; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
4565; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4566; CHECK-NEXT:    .p2align 4, 0x90
4567; CHECK-NEXT:  .LBB129_1: # %bb1
4568; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4569; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
4570; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k1
4571; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
4572; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
4573; CHECK-NEXT:    addq $32, %rax
4574; CHECK-NEXT:    jne .LBB129_1
4575; CHECK-NEXT:  # %bb.2: # %bb10
4576; CHECK-NEXT:    vzeroupper
4577; CHECK-NEXT:    retq
4578bb:
4579  br label %bb1
4580
4581bb1:                                              ; preds = %bb1, %bb
4582  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4583  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4584  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4585  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
4586  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4587  %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
4588  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4589  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4590  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
4591  %tmp8 = add i64 %tmp, 4
4592  %tmp9 = icmp eq i64 %tmp8, 1024
4593  br i1 %tmp9, label %bb10, label %bb1
4594
4595bb10:                                             ; preds = %bb1
4596  ret void
4597}
4598
4599define void @bcast_unfold_ptestnm_v4i64(i64* %arg) {
4600; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
4601; CHECK:       # %bb.0: # %bb
4602; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
4603; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
4604; CHECK-NEXT:    .p2align 4, 0x90
4605; CHECK-NEXT:  .LBB130_1: # %bb1
4606; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4607; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
4608; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k1
4609; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
4610; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
4611; CHECK-NEXT:    addq $32, %rax
4612; CHECK-NEXT:    jne .LBB130_1
4613; CHECK-NEXT:  # %bb.2: # %bb10
4614; CHECK-NEXT:    vzeroupper
4615; CHECK-NEXT:    retq
4616bb:
4617  br label %bb1
4618
4619bb1:                                              ; preds = %bb1, %bb
4620  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
4621  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
4622  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
4623  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
4624  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
4625  %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
4626  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
4627  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
4628  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
4629  %tmp8 = add i64 %tmp, 4
4630  %tmp9 = icmp eq i64 %tmp8, 1024
4631  br i1 %tmp9, label %bb10, label %bb1
4632
4633bb10:                                             ; preds = %bb1
4634  ret void
4635}
4636
4637; The or/and pattern here should be turned into vpternlog. The multiplies are
4638; there to increase the use count of the loads so they can't be folded into the
4639; logic ops. We want to unfold the broadcast and pull it out of the loop.
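; A rough note on the expected immediate (informal, not part of the generated
; checks): in AT&T syntax "vpternlogd $imm, %C, %B, %A" computes, per bit,
; imm[(A<<2)|(B<<1)|C]. With imm = 216 = 0xD8 = 0b11011000 that evaluates to
; (B & C) | (A & ~C), i.e. a bitwise select on the broadcast 32767 mask,
; matching the or-of-two-ands merge written out in the IR below.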
4640define void @bcast_unfold_vpternlog_v16i32(i32* %arg, i32* %arg1) {
4641; CHECK-LABEL: bcast_unfold_vpternlog_v16i32:
4642; CHECK:       # %bb.0: # %bb
4643; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
4644; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
4645; CHECK-NEXT:    .p2align 4, 0x90
4646; CHECK-NEXT:  .LBB131_1: # %bb2
4647; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
4648; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm1
4649; CHECK-NEXT:    vmovdqu64 4096(%rsi,%rax), %zmm2
4650; CHECK-NEXT:    vpmulld %zmm2, %zmm1, %zmm3
4651; CHECK-NEXT:    vpternlogd $216, %zmm0, %zmm1, %zmm2
4652; CHECK-NEXT:    vpmulld %zmm3, %zmm2, %zmm1
4653; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
4654; CHECK-NEXT:    addq $64, %rax
4655; CHECK-NEXT:    jne .LBB131_1
4656; CHECK-NEXT:  # %bb.2: # %bb20
4657; CHECK-NEXT:    vzeroupper
4658; CHECK-NEXT:    retq
4659bb:
4660  br label %bb2
4661
4662bb2:                                              ; preds = %bb2, %bb
4663  %tmp = phi i64 [ 0, %bb ], [ %tmp18, %bb2 ]
4664  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
4665  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
4666  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
4667  %tmp6 = getelementptr inbounds i32, i32* %arg1, i64 %tmp
4668  %tmp10 = bitcast i32* %tmp6 to <16 x i32>*
4669  %tmp11 = load <16 x i32>, <16 x i32>* %tmp10, align 4
4670  %tmp12 = and <16 x i32> %tmp5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
4671  %tmp13 = and <16 x i32> %tmp11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
4672  %tmp14 = or <16 x i32> %tmp12, %tmp13
4673  %tmp15 = mul <16 x i32> %tmp14, %tmp5
4674  %tmp16 = mul <16 x i32> %tmp15, %tmp11
4675  %tmp17 = bitcast i32* %tmp3 to <16 x i32>*
4676  store <16 x i32> %tmp16, <16 x i32>* %tmp17, align 4
4677  %tmp18 = add i64 %tmp, 16
4678  %tmp19 = icmp eq i64 %tmp18, 1024
4679  br i1 %tmp19, label %bb20, label %bb2
4680
4681bb20:                                             ; preds = %bb2
4682  ret void
4683}
4684
4685