; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=avx512vl | FileCheck %s

; Test that we can unfold constant pool loads when we're using avx512's
; ability to fold a broadcast load into an operation.

; Integer add with a splat-constant operand: vNi32 uses vpbroadcastd + vpaddd,
; vNi64 uses vpbroadcastq + vpaddq (the 128-bit i64 case has no broadcast form
; and materializes the constant with vmovdqa).

define void @bcast_unfold_add_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB0_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB0_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
  %tmp6 = add nsw <16 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB1_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB1_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
  %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
  %tmp6 = add nsw <8 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB2_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB2_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
  %tmp6 = add nsw <4 x i32> %tmp5, <i32 2, i32 2, i32 2, i32 2>
  %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB3_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB3_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
  %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
  %tmp6 = add nsw <8 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB4_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
  %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
  %tmp6 = add nsw <4 x i64> %tmp5, <i64 2, i64 2, i64 2, i64 2>
  %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_add_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_add_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpaddq 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB5_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
  %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
  %tmp6 = add nsw <2 x i64> %tmp5, <i64 2, i64 2>
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; Integer multiply by a splat constant: i32 folds the broadcast into vpmulld;
; the i64 multiply-by-3 cases are instead lowered to shift/add-style sequences
; (vpaddq x,x then vpaddq), so no broadcast constant is materialized.

define void @bcast_unfold_mul_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB6_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB6_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
  %tmp6 = mul nsw <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB7_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovdqu %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB7_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
  %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
  %tmp6 = mul nsw <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB8_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpmulld 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB8_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
  %tmp6 = mul nsw <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB9_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 8192(%rdi,%rax), %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB9_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
  %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
  %tmp6 = mul nsw <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB10_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovdqu %ymm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB10_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
  %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
  %tmp6 = mul nsw <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_mul_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_mul_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB11_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB11_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
  %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
  %tmp6 = mul nsw <2 x i64> %tmp5, <i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; Bitwise OR with a splat constant: 512-bit vectors use the EVEX integer forms
; (vpord/vporq); 256/128-bit cases select the VEX floating-point domain
; (vbroadcastss/vbroadcastsd + vorps) instead.

define void @bcast_unfold_or_v16i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB12_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB12_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
  %tmp6 = or <16 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <16 x i32>*
  store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB13_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB13_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <8 x i32>*
  %tmp5 = load <8 x i32>, <8 x i32>* %tmp4, align 4
  %tmp6 = or <8 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <8 x i32>*
  store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB14_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB14_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %tmp5 = load <4 x i32>, <4 x i32>* %tmp4, align 4
  %tmp6 = or <4 x i32> %tmp5, <i32 3, i32 3, i32 3, i32 3>
  %tmp7 = bitcast i32* %tmp3 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB15_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vporq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB15_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <8 x i64>*
  %tmp5 = load <8 x i64>, <8 x i64>* %tmp4, align 8
  %tmp6 = or <8 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <8 x i64>*
  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3,3,3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB16_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB16_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <4 x i64>*
  %tmp5 = load <4 x i64>, <4 x i64>* %tmp4, align 8
  %tmp6 = or <4 x i64> %tmp5, <i64 3, i64 3, i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_or_v2i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_or_v2i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [3,3]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB17_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB17_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb2 ]
  %tmp3 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp4 = bitcast i64* %tmp3 to <2 x i64>*
  %tmp5 = load <2 x i64>, <2 x i64>* %tmp4, align 8
  %tmp6 = or <2 x i64> %tmp5, <i64 3, i64 3>
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb2

bb10:                                             ; preds = %bb2
  ret void
}

; fneg lowers to XOR with a broadcast sign-bit mask (-0.0 splat).

define void @bcast_unfold_fneg_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB18_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxord 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB18_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fneg <16 x float> %tmp4
  %tmp6 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB19_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB19_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fneg <8 x float> %tmp4
  %tmp6 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB20_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB20_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fneg <4 x float> %tmp4
  %tmp6 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB21_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpxorq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB21_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fneg <8 x double> %tmp4
  %tmp6 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB22_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB22_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fneg <4 x double> %tmp4
  %tmp6 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fneg_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fneg_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB23_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vxorps 8192(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB23_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fneg <2 x double> %tmp4
  %tmp6 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; fabs lowers to AND with a broadcast mask that clears the sign bit
; (the all-ones-except-sign pattern prints as NaN in the asm comments).

define void @bcast_unfold_fabs_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB24_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandd 4096(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB24_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = call <16 x float> @llvm.fabs.v16f32(<16 x float> %tmp4)
  %tmp6 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp5, <16 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 16
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <16 x float> @llvm.fabs.v16f32(<16 x float>) #0

define void @bcast_unfold_fabs_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB25_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB25_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = call <8 x float> @llvm.fabs.v8f32(<8 x float> %tmp4)
  %tmp6 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp5, <8 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <8 x float> @llvm.fabs.v8f32(<8 x float>) #0

define void @bcast_unfold_fabs_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB26_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB26_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %tmp4)
  %tmp6 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp5, <4 x float>* %tmp6, align 4
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #0

define void @bcast_unfold_fabs_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB27_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpandq 8192(%rdi,%rax), %zmm0, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB27_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = call <8 x double> @llvm.fabs.v8f64(<8 x double> %tmp4)
  %tmp6 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp5, <8 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 8
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <8 x double> @llvm.fabs.v8f64(<8 x double>) #0

define void @bcast_unfold_fabs_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB28_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vandps 8192(%rdi,%rax), %ymm0, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB28_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %tmp4)
  %tmp6 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x double> @llvm.fabs.v4f64(<4 x double>) #0

; NOTE(review): this chunk is truncated mid-function below; the remainder of
; @bcast_unfold_fabs_v2f64 lies outside the visible range.
define void @bcast_unfold_fabs_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fabs_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192,
%rax # imm = 0xE000 1006; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN] 1007; CHECK-NEXT: .p2align 4, 0x90 1008; CHECK-NEXT: .LBB29_1: # %bb1 1009; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1010; CHECK-NEXT: vandps 8192(%rdi,%rax), %xmm0, %xmm1 1011; CHECK-NEXT: vmovups %xmm1, 8192(%rdi,%rax) 1012; CHECK-NEXT: addq $16, %rax 1013; CHECK-NEXT: jne .LBB29_1 1014; CHECK-NEXT: # %bb.2: # %bb9 1015; CHECK-NEXT: retq 1016bb: 1017 br label %bb1 1018 1019bb1: ; preds = %bb1, %bb 1020 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1021 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1022 %tmp3 = bitcast double* %tmp2 to <2 x double>* 1023 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8 1024 %tmp5 = call <2 x double> @llvm.fabs.v2f64(<2 x double> %tmp4) 1025 %tmp6 = bitcast double* %tmp2 to <2 x double>* 1026 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8 1027 %tmp7 = add i64 %tmp, 2 1028 %tmp8 = icmp eq i64 %tmp7, 1024 1029 br i1 %tmp8, label %bb9, label %bb1 1030 1031bb9: ; preds = %bb1 1032 ret void 1033} 1034 1035; Function Attrs: nounwind readnone speculatable willreturn 1036declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0 1037 1038define void @bcast_unfold_fadd_v16f32(float* nocapture %arg) { 1039; CHECK-LABEL: bcast_unfold_fadd_v16f32: 1040; CHECK: # %bb.0: # %bb 1041; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1042; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1043; CHECK-NEXT: .p2align 4, 0x90 1044; CHECK-NEXT: .LBB30_1: # %bb1 1045; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1046; CHECK-NEXT: vaddps 4096(%rdi,%rax), %zmm0, %zmm1 1047; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) 1048; CHECK-NEXT: addq $64, %rax 1049; CHECK-NEXT: jne .LBB30_1 1050; CHECK-NEXT: # %bb.2: # %bb9 1051; CHECK-NEXT: vzeroupper 1052; CHECK-NEXT: retq 1053bb: 1054 br label %bb1 1055 1056bb1: ; preds = %bb1, %bb 1057 %tmp = phi i64 [ 0, %bb 
], [ %tmp7, %bb1 ] 1058 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1059 %tmp3 = bitcast float* %tmp2 to <16 x float>* 1060 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4 1061 %tmp5 = fadd <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> 1062 %tmp6 = bitcast float* %tmp2 to <16 x float>* 1063 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4 1064 %tmp7 = add i64 %tmp, 16 1065 %tmp8 = icmp eq i64 %tmp7, 1024 1066 br i1 %tmp8, label %bb9, label %bb1 1067 1068bb9: ; preds = %bb1 1069 ret void 1070} 1071 1072define void @bcast_unfold_fadd_v8f32(float* nocapture %arg) { 1073; CHECK-LABEL: bcast_unfold_fadd_v8f32: 1074; CHECK: # %bb.0: # %bb 1075; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1076; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1077; CHECK-NEXT: .p2align 4, 0x90 1078; CHECK-NEXT: .LBB31_1: # %bb1 1079; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1080; CHECK-NEXT: vaddps 4096(%rdi,%rax), %ymm0, %ymm1 1081; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) 1082; CHECK-NEXT: addq $32, %rax 1083; CHECK-NEXT: jne .LBB31_1 1084; CHECK-NEXT: # %bb.2: # %bb9 1085; CHECK-NEXT: vzeroupper 1086; CHECK-NEXT: retq 1087bb: 1088 br label %bb1 1089 1090bb1: ; preds = %bb1, %bb 1091 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1092 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1093 %tmp3 = bitcast float* %tmp2 to <8 x float>* 1094 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4 1095 %tmp5 = fadd <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> 1096 %tmp6 = 
bitcast float* %tmp2 to <8 x float>* 1097 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4 1098 %tmp7 = add i64 %tmp, 8 1099 %tmp8 = icmp eq i64 %tmp7, 1024 1100 br i1 %tmp8, label %bb9, label %bb1 1101 1102bb9: ; preds = %bb1 1103 ret void 1104} 1105 1106define void @bcast_unfold_fadd_v4f32(float* nocapture %arg) { 1107; CHECK-LABEL: bcast_unfold_fadd_v4f32: 1108; CHECK: # %bb.0: # %bb 1109; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1110; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1111; CHECK-NEXT: .p2align 4, 0x90 1112; CHECK-NEXT: .LBB32_1: # %bb1 1113; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1114; CHECK-NEXT: vaddps 4096(%rdi,%rax), %xmm0, %xmm1 1115; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) 1116; CHECK-NEXT: addq $16, %rax 1117; CHECK-NEXT: jne .LBB32_1 1118; CHECK-NEXT: # %bb.2: # %bb9 1119; CHECK-NEXT: retq 1120bb: 1121 br label %bb1 1122 1123bb1: ; preds = %bb1, %bb 1124 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1125 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1126 %tmp3 = bitcast float* %tmp2 to <4 x float>* 1127 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4 1128 %tmp5 = fadd <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> 1129 %tmp6 = bitcast float* %tmp2 to <4 x float>* 1130 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4 1131 %tmp7 = add i64 %tmp, 4 1132 %tmp8 = icmp eq i64 %tmp7, 1024 1133 br i1 %tmp8, label %bb9, label %bb1 1134 1135bb9: ; preds = %bb1 1136 ret void 1137} 1138 1139define void @bcast_unfold_fadd_v8f64(double* nocapture %arg) { 1140; CHECK-LABEL: bcast_unfold_fadd_v8f64: 1141; CHECK: # %bb.0: # %bb 1142; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 1143; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1144; CHECK-NEXT: .p2align 4, 0x90 1145; CHECK-NEXT: .LBB33_1: # %bb1 1146; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1147; CHECK-NEXT: vaddpd 
8192(%rdi,%rax), %zmm0, %zmm1 1148; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) 1149; CHECK-NEXT: addq $64, %rax 1150; CHECK-NEXT: jne .LBB33_1 1151; CHECK-NEXT: # %bb.2: # %bb9 1152; CHECK-NEXT: vzeroupper 1153; CHECK-NEXT: retq 1154bb: 1155 br label %bb1 1156 1157bb1: ; preds = %bb1, %bb 1158 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1159 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1160 %tmp3 = bitcast double* %tmp2 to <8 x double>* 1161 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8 1162 %tmp5 = fadd <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00> 1163 %tmp6 = bitcast double* %tmp2 to <8 x double>* 1164 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8 1165 %tmp7 = add i64 %tmp, 8 1166 %tmp8 = icmp eq i64 %tmp7, 1024 1167 br i1 %tmp8, label %bb9, label %bb1 1168 1169bb9: ; preds = %bb1 1170 ret void 1171} 1172 1173define void @bcast_unfold_fadd_v4f64(double* nocapture %arg) { 1174; CHECK-LABEL: bcast_unfold_fadd_v4f64: 1175; CHECK: # %bb.0: # %bb 1176; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 1177; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1178; CHECK-NEXT: .p2align 4, 0x90 1179; CHECK-NEXT: .LBB34_1: # %bb1 1180; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1181; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %ymm0, %ymm1 1182; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) 1183; CHECK-NEXT: addq $32, %rax 1184; CHECK-NEXT: jne .LBB34_1 1185; CHECK-NEXT: # %bb.2: # %bb9 1186; CHECK-NEXT: vzeroupper 1187; CHECK-NEXT: retq 1188bb: 1189 br label %bb1 1190 1191bb1: ; preds = %bb1, %bb 1192 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1193 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1194 %tmp3 = bitcast double* %tmp2 to <4 x double>* 1195 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8 1196 %tmp5 = fadd <4 x double> %tmp4, <double 2.000000e+00, double 
2.000000e+00, double 2.000000e+00, double 2.000000e+00> 1197 %tmp6 = bitcast double* %tmp2 to <4 x double>* 1198 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8 1199 %tmp7 = add i64 %tmp, 4 1200 %tmp8 = icmp eq i64 %tmp7, 1024 1201 br i1 %tmp8, label %bb9, label %bb1 1202 1203bb9: ; preds = %bb1 1204 ret void 1205} 1206 1207define void @bcast_unfold_fadd_v2f64(double* nocapture %arg) { 1208; CHECK-LABEL: bcast_unfold_fadd_v2f64: 1209; CHECK: # %bb.0: # %bb 1210; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 1211; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0] 1212; CHECK-NEXT: .p2align 4, 0x90 1213; CHECK-NEXT: .LBB35_1: # %bb1 1214; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1215; CHECK-NEXT: vaddpd 8192(%rdi,%rax), %xmm0, %xmm1 1216; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) 1217; CHECK-NEXT: addq $16, %rax 1218; CHECK-NEXT: jne .LBB35_1 1219; CHECK-NEXT: # %bb.2: # %bb9 1220; CHECK-NEXT: retq 1221bb: 1222 br label %bb1 1223 1224bb1: ; preds = %bb1, %bb 1225 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1226 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1227 %tmp3 = bitcast double* %tmp2 to <2 x double>* 1228 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8 1229 %tmp5 = fadd <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00> 1230 %tmp6 = bitcast double* %tmp2 to <2 x double>* 1231 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8 1232 %tmp7 = add i64 %tmp, 2 1233 %tmp8 = icmp eq i64 %tmp7, 1024 1234 br i1 %tmp8, label %bb9, label %bb1 1235 1236bb9: ; preds = %bb1 1237 ret void 1238} 1239 1240define void @bcast_unfold_fmul_v16f32(float* nocapture %arg) { 1241; CHECK-LABEL: bcast_unfold_fmul_v16f32: 1242; CHECK: # %bb.0: # %bb 1243; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1244; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] 1245; CHECK-NEXT: .p2align 4, 0x90 1246; CHECK-NEXT: .LBB36_1: # %bb1 1247; 
CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1248; CHECK-NEXT: vmulps 4096(%rdi,%rax), %zmm0, %zmm1 1249; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) 1250; CHECK-NEXT: addq $64, %rax 1251; CHECK-NEXT: jne .LBB36_1 1252; CHECK-NEXT: # %bb.2: # %bb9 1253; CHECK-NEXT: vzeroupper 1254; CHECK-NEXT: retq 1255bb: 1256 br label %bb1 1257 1258bb1: ; preds = %bb1, %bb 1259 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1260 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1261 %tmp3 = bitcast float* %tmp2 to <16 x float>* 1262 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4 1263 %tmp5 = fmul <16 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00> 1264 %tmp6 = bitcast float* %tmp2 to <16 x float>* 1265 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4 1266 %tmp7 = add i64 %tmp, 16 1267 %tmp8 = icmp eq i64 %tmp7, 1024 1268 br i1 %tmp8, label %bb9, label %bb1 1269 1270bb9: ; preds = %bb1 1271 ret void 1272} 1273 1274define void @bcast_unfold_fmul_v8f32(float* nocapture %arg) { 1275; CHECK-LABEL: bcast_unfold_fmul_v8f32: 1276; CHECK: # %bb.0: # %bb 1277; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1278; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] 1279; CHECK-NEXT: .p2align 4, 0x90 1280; CHECK-NEXT: .LBB37_1: # %bb1 1281; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1282; CHECK-NEXT: vmulps 4096(%rdi,%rax), %ymm0, %ymm1 1283; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) 1284; CHECK-NEXT: addq $32, %rax 1285; CHECK-NEXT: jne .LBB37_1 1286; CHECK-NEXT: # %bb.2: # %bb9 1287; CHECK-NEXT: vzeroupper 1288; CHECK-NEXT: retq 1289bb: 1290 br label %bb1 1291 1292bb1: ; preds = %bb1, %bb 1293 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 
] 1294 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1295 %tmp3 = bitcast float* %tmp2 to <8 x float>* 1296 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4 1297 %tmp5 = fmul <8 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00> 1298 %tmp6 = bitcast float* %tmp2 to <8 x float>* 1299 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4 1300 %tmp7 = add i64 %tmp, 8 1301 %tmp8 = icmp eq i64 %tmp7, 1024 1302 br i1 %tmp8, label %bb9, label %bb1 1303 1304bb9: ; preds = %bb1 1305 ret void 1306} 1307 1308define void @bcast_unfold_fmul_v4f32(float* nocapture %arg) { 1309; CHECK-LABEL: bcast_unfold_fmul_v4f32: 1310; CHECK: # %bb.0: # %bb 1311; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1312; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] 1313; CHECK-NEXT: .p2align 4, 0x90 1314; CHECK-NEXT: .LBB38_1: # %bb1 1315; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1316; CHECK-NEXT: vmulps 4096(%rdi,%rax), %xmm0, %xmm1 1317; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) 1318; CHECK-NEXT: addq $16, %rax 1319; CHECK-NEXT: jne .LBB38_1 1320; CHECK-NEXT: # %bb.2: # %bb9 1321; CHECK-NEXT: retq 1322bb: 1323 br label %bb1 1324 1325bb1: ; preds = %bb1, %bb 1326 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1327 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1328 %tmp3 = bitcast float* %tmp2 to <4 x float>* 1329 %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4 1330 %tmp5 = fmul <4 x float> %tmp4, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00> 1331 %tmp6 = bitcast float* %tmp2 to <4 x float>* 1332 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4 1333 %tmp7 = add i64 %tmp, 4 1334 %tmp8 = icmp eq i64 %tmp7, 1024 1335 br i1 %tmp8, label %bb9, label %bb1 1336 1337bb9: ; preds = %bb1 1338 ret void 1339} 1340 1341define void @bcast_unfold_fmul_v8f64(double* nocapture %arg) { 
1342; CHECK-LABEL: bcast_unfold_fmul_v8f64: 1343; CHECK: # %bb.0: # %bb 1344; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 1345; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0] 1346; CHECK-NEXT: .p2align 4, 0x90 1347; CHECK-NEXT: .LBB39_1: # %bb1 1348; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1349; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %zmm0, %zmm1 1350; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) 1351; CHECK-NEXT: addq $64, %rax 1352; CHECK-NEXT: jne .LBB39_1 1353; CHECK-NEXT: # %bb.2: # %bb9 1354; CHECK-NEXT: vzeroupper 1355; CHECK-NEXT: retq 1356bb: 1357 br label %bb1 1358 1359bb1: ; preds = %bb1, %bb 1360 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1361 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1362 %tmp3 = bitcast double* %tmp2 to <8 x double>* 1363 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8 1364 %tmp5 = fmul <8 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00> 1365 %tmp6 = bitcast double* %tmp2 to <8 x double>* 1366 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8 1367 %tmp7 = add i64 %tmp, 8 1368 %tmp8 = icmp eq i64 %tmp7, 1024 1369 br i1 %tmp8, label %bb9, label %bb1 1370 1371bb9: ; preds = %bb1 1372 ret void 1373} 1374 1375define void @bcast_unfold_fmul_v4f64(double* nocapture %arg) { 1376; CHECK-LABEL: bcast_unfold_fmul_v4f64: 1377; CHECK: # %bb.0: # %bb 1378; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 1379; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0] 1380; CHECK-NEXT: .p2align 4, 0x90 1381; CHECK-NEXT: .LBB40_1: # %bb1 1382; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1383; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %ymm0, %ymm1 1384; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) 1385; CHECK-NEXT: addq $32, %rax 1386; CHECK-NEXT: jne .LBB40_1 1387; CHECK-NEXT: # %bb.2: # %bb9 1388; CHECK-NEXT: vzeroupper 
1389; CHECK-NEXT: retq 1390bb: 1391 br label %bb1 1392 1393bb1: ; preds = %bb1, %bb 1394 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1395 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1396 %tmp3 = bitcast double* %tmp2 to <4 x double>* 1397 %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8 1398 %tmp5 = fmul <4 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00> 1399 %tmp6 = bitcast double* %tmp2 to <4 x double>* 1400 store <4 x double> %tmp5, <4 x double>* %tmp6, align 8 1401 %tmp7 = add i64 %tmp, 4 1402 %tmp8 = icmp eq i64 %tmp7, 1024 1403 br i1 %tmp8, label %bb9, label %bb1 1404 1405bb9: ; preds = %bb1 1406 ret void 1407} 1408 1409define void @bcast_unfold_fmul_v2f64(double* nocapture %arg) { 1410; CHECK-LABEL: bcast_unfold_fmul_v2f64: 1411; CHECK: # %bb.0: # %bb 1412; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 1413; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [3.0E+0,3.0E+0] 1414; CHECK-NEXT: .p2align 4, 0x90 1415; CHECK-NEXT: .LBB41_1: # %bb1 1416; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1417; CHECK-NEXT: vmulpd 8192(%rdi,%rax), %xmm0, %xmm1 1418; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) 1419; CHECK-NEXT: addq $16, %rax 1420; CHECK-NEXT: jne .LBB41_1 1421; CHECK-NEXT: # %bb.2: # %bb9 1422; CHECK-NEXT: retq 1423bb: 1424 br label %bb1 1425 1426bb1: ; preds = %bb1, %bb 1427 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1428 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1429 %tmp3 = bitcast double* %tmp2 to <2 x double>* 1430 %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8 1431 %tmp5 = fmul <2 x double> %tmp4, <double 3.000000e+00, double 3.000000e+00> 1432 %tmp6 = bitcast double* %tmp2 to <2 x double>* 1433 store <2 x double> %tmp5, <2 x double>* %tmp6, align 8 1434 %tmp7 = add i64 %tmp, 2 1435 %tmp8 = icmp eq i64 %tmp7, 1024 1436 br i1 %tmp8, label %bb9, label %bb1 1437 1438bb9: ; preds = %bb1 1439 ret void 1440} 1441 1442define void @bcast_unfold_fdiv_v16f32(float* 
nocapture %arg) { 1443; CHECK-LABEL: bcast_unfold_fdiv_v16f32: 1444; CHECK: # %bb.0: # %bb 1445; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1446; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1447; CHECK-NEXT: .p2align 4, 0x90 1448; CHECK-NEXT: .LBB42_1: # %bb1 1449; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1450; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 1451; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm1 1452; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) 1453; CHECK-NEXT: addq $64, %rax 1454; CHECK-NEXT: jne .LBB42_1 1455; CHECK-NEXT: # %bb.2: # %bb9 1456; CHECK-NEXT: vzeroupper 1457; CHECK-NEXT: retq 1458bb: 1459 br label %bb1 1460 1461bb1: ; preds = %bb1, %bb 1462 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1463 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1464 %tmp3 = bitcast float* %tmp2 to <16 x float>* 1465 %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4 1466 %tmp5 = fdiv <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> 1467 %tmp6 = bitcast float* %tmp2 to <16 x float>* 1468 store <16 x float> %tmp5, <16 x float>* %tmp6, align 4 1469 %tmp7 = add i64 %tmp, 16 1470 %tmp8 = icmp eq i64 %tmp7, 1024 1471 br i1 %tmp8, label %bb9, label %bb1 1472 1473bb9: ; preds = %bb1 1474 ret void 1475} 1476 1477define void @bcast_unfold_fdiv_v8f32(float* nocapture %arg) { 1478; CHECK-LABEL: bcast_unfold_fdiv_v8f32: 1479; CHECK: # %bb.0: # %bb 1480; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1481; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1482; CHECK-NEXT: .p2align 4, 0x90 1483; CHECK-NEXT: .LBB43_1: # 
%bb1 1484; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1485; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 1486; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm1 1487; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) 1488; CHECK-NEXT: addq $32, %rax 1489; CHECK-NEXT: jne .LBB43_1 1490; CHECK-NEXT: # %bb.2: # %bb9 1491; CHECK-NEXT: vzeroupper 1492; CHECK-NEXT: retq 1493bb: 1494 br label %bb1 1495 1496bb1: ; preds = %bb1, %bb 1497 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1498 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1499 %tmp3 = bitcast float* %tmp2 to <8 x float>* 1500 %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4 1501 %tmp5 = fdiv <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> 1502 %tmp6 = bitcast float* %tmp2 to <8 x float>* 1503 store <8 x float> %tmp5, <8 x float>* %tmp6, align 4 1504 %tmp7 = add i64 %tmp, 8 1505 %tmp8 = icmp eq i64 %tmp7, 1024 1506 br i1 %tmp8, label %bb9, label %bb1 1507 1508bb9: ; preds = %bb1 1509 ret void 1510} 1511 1512define void @bcast_unfold_fdiv_v4f32(float* nocapture %arg) { 1513; CHECK-LABEL: bcast_unfold_fdiv_v4f32: 1514; CHECK: # %bb.0: # %bb 1515; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 1516; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1517; CHECK-NEXT: .p2align 4, 0x90 1518; CHECK-NEXT: .LBB44_1: # %bb1 1519; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1520; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 1521; CHECK-NEXT: vdivps %xmm0, %xmm1, %xmm1 1522; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) 1523; CHECK-NEXT: addq $16, %rax 1524; CHECK-NEXT: jne .LBB44_1 1525; CHECK-NEXT: # %bb.2: # %bb9 1526; CHECK-NEXT: retq 1527bb: 1528 br label %bb1 1529 1530bb1: ; preds = %bb1, %bb 1531 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1532 %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp 1533 %tmp3 = bitcast float* %tmp2 to <4 x float>* 1534 %tmp4 = load 
<4 x float>, <4 x float>* %tmp3, align 4 1535 %tmp5 = fdiv <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> 1536 %tmp6 = bitcast float* %tmp2 to <4 x float>* 1537 store <4 x float> %tmp5, <4 x float>* %tmp6, align 4 1538 %tmp7 = add i64 %tmp, 4 1539 %tmp8 = icmp eq i64 %tmp7, 1024 1540 br i1 %tmp8, label %bb9, label %bb1 1541 1542bb9: ; preds = %bb1 1543 ret void 1544} 1545 1546define void @bcast_unfold_fdiv_v8f64(double* nocapture %arg) { 1547; CHECK-LABEL: bcast_unfold_fdiv_v8f64: 1548; CHECK: # %bb.0: # %bb 1549; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 1550; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] 1551; CHECK-NEXT: .p2align 4, 0x90 1552; CHECK-NEXT: .LBB45_1: # %bb1 1553; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 1554; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 1555; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm1 1556; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) 1557; CHECK-NEXT: addq $64, %rax 1558; CHECK-NEXT: jne .LBB45_1 1559; CHECK-NEXT: # %bb.2: # %bb9 1560; CHECK-NEXT: vzeroupper 1561; CHECK-NEXT: retq 1562bb: 1563 br label %bb1 1564 1565bb1: ; preds = %bb1, %bb 1566 %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ] 1567 %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp 1568 %tmp3 = bitcast double* %tmp2 to <8 x double>* 1569 %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8 1570 %tmp5 = fdiv <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00> 1571 %tmp6 = bitcast double* %tmp2 to <8 x double>* 1572 store <8 x double> %tmp5, <8 x double>* %tmp6, align 8 1573 %tmp7 = add i64 %tmp, 8 1574 %tmp8 = icmp eq i64 %tmp7, 1024 1575 br i1 %tmp8, label %bb9, label %bb1 1576 1577bb9: ; preds = %bb1 1578 ret void 1579} 1580 1581define void @bcast_unfold_fdiv_v4f64(double* nocapture %arg) { 1582; CHECK-LABEL: 
bcast_unfold_fdiv_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB46_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vdivpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB46_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fdiv <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp5, <4 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 4
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fdiv_v2f64(double* nocapture %arg) {
; CHECK-LABEL: bcast_unfold_fdiv_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB47_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB47_1
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp7, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fdiv <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp5, <2 x double>* %tmp6, align 8
  %tmp7 = add i64 %tmp, 2
  %tmp8 = icmp eq i64 %tmp7, 1024
  br i1 %tmp8, label %bb9, label %bb1

bb9:                                              ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB48_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB48_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp4 = bitcast float* %tmp3 to <4 x float>*
  %tmp5 = load <4 x float>, <4 x float>* %tmp4, align 4
  %tmp6 = fmul contract <4 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp8 = bitcast float* %tmp3 to <4 x float>*
  store <4 x float> %tmp7, <4 x float>* %tmp8, align 4
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB49_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB49_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fmul contract <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <4 x float> %tmp4, %tmp5
  %tmp7 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB50_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB50_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp4 = bitcast float* %tmp3 to <8 x float>*
  %tmp5 = load <8 x float>, <8 x float>* %tmp4, align 4
  %tmp6 = fmul contract <8 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp8 = bitcast float* %tmp3 to <8 x float>*
  store <8 x float> %tmp7, <8 x float>* %tmp8, align 4
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB51_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB51_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fmul contract <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <8 x float> %tmp4, %tmp5
  %tmp7 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB52_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB52_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp4 = bitcast float* %tmp3 to <16 x float>*
  %tmp5 = load <16 x float>, <16 x float>* %tmp4, align 4
  %tmp6 = fmul contract <16 x float> %tmp5, %tmp5
  %tmp7 = fadd contract <16 x float> %tmp6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp8 = bitcast float* %tmp3 to <16 x float>*
  store <16 x float> %tmp7, <16 x float>* %tmp8, align 4
  %tmp9 = add i64 %tmp, 16
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB53_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB53_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fmul contract <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = fadd contract <16 x float> %tmp4, %tmp5
  %tmp7 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB54_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB54_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp4 = bitcast double* %tmp3 to <2 x double>*
  %tmp5 = load <2 x double>, <2 x double>* %tmp4, align 4
  %tmp6 = fmul contract <2 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <2 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00>
  %tmp8 = bitcast double* %tmp3 to <2 x double>*
  store <2 x double> %tmp7, <2 x double>* %tmp8, align 8
  %tmp9 = add i64 %tmp, 2
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB55_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB55_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fmul contract <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <2 x double> %tmp4, %tmp5
  %tmp7 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB56_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB56_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp4 = bitcast double* %tmp3 to <4 x double>*
  %tmp5 = load <4 x double>, <4 x double>* %tmp4, align 8
  %tmp6 = fmul contract <4 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <4 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp8 = bitcast double* %tmp3 to <4 x double>*
  store <4 x double> %tmp7, <4 x double>* %tmp8, align 8
  %tmp9 = add i64 %tmp, 4
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB57_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB57_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fmul contract <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <4 x double> %tmp4, %tmp5
  %tmp7 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fma213_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma213_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB58_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB58_1
; CHECK-NEXT:  # %bb.2: # %bb11
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp9, %bb2 ]
  %tmp3 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp4 = bitcast double* %tmp3 to <8 x double>*
  %tmp5 = load <8 x double>, <8 x double>* %tmp4, align 8
  %tmp6 = fmul contract <8 x double> %tmp5, %tmp5
  %tmp7 = fadd contract <8 x double> %tmp6, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp8 = bitcast double* %tmp3 to <8 x double>*
  store <8 x double> %tmp7, <8 x double>* %tmp8, align 8
  %tmp9 = add i64 %tmp, 8
  %tmp10 = icmp eq i64 %tmp9, 1024
  br i1 %tmp10, label %bb11, label %bb2

bb11:                                             ; preds = %bb2
  ret void
}

define void @bcast_unfold_fma231_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fma231_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB59_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB59_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fmul contract <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = fadd contract <8 x double> %tmp4, %tmp5
  %tmp7 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB60_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB60_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fcmp ogt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB61_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB61_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fcmp ogt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB62_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB62_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fcmp ogt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB63_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vmaxpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB63_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fcmp ogt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB64_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vmaxpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB64_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fcmp ogt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmax_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmax_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB65_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmaxpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB65_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fcmp ogt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB66_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminps %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB66_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB67_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminps %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB67_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB68_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminps %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB68_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB69_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm1
; CHECK-NEXT:    vminpd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vmovupd %xmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB69_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB70_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vminpd %ymm0, %ymm1, %ymm1
; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB70_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_fmin_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_fmin_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB71_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
; CHECK-NEXT:    vminpd %zmm0, %zmm1, %zmm1
; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB71_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_smin_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB72_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vpminsd 4096(%rdi,%rax), %xmm0, %xmm1
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB72_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
  %tmp5 = icmp slt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_smin_v8i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_smin_v8i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
;
CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] 2545; CHECK-NEXT: .p2align 4, 0x90 2546; CHECK-NEXT: .LBB73_1: # %bb1 2547; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2548; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %ymm0, %ymm1 2549; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) 2550; CHECK-NEXT: addq $32, %rax 2551; CHECK-NEXT: jne .LBB73_1 2552; CHECK-NEXT: # %bb.2: # %bb10 2553; CHECK-NEXT: vzeroupper 2554; CHECK-NEXT: retq 2555bb: 2556 br label %bb1 2557 2558bb1: ; preds = %bb1, %bb 2559 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2560 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 2561 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 2562 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4 2563 %tmp5 = icmp slt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2564 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2565 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 2566 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 2567 %tmp8 = add i64 %tmp, 8 2568 %tmp9 = icmp eq i64 %tmp8, 1024 2569 br i1 %tmp9, label %bb10, label %bb1 2570 2571bb10: ; preds = %bb1 2572 ret void 2573} 2574 2575define void @bcast_unfold_smin_v16i32(i32* %arg) { 2576; CHECK-LABEL: bcast_unfold_smin_v16i32: 2577; CHECK: # %bb.0: # %bb 2578; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 2579; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2580; CHECK-NEXT: .p2align 4, 0x90 2581; CHECK-NEXT: .LBB74_1: # %bb1 2582; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2583; CHECK-NEXT: vpminsd 4096(%rdi,%rax), %zmm0, %zmm1 2584; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) 2585; CHECK-NEXT: addq $64, %rax 2586; CHECK-NEXT: jne .LBB74_1 2587; CHECK-NEXT: # %bb.2: # %bb10 2588; CHECK-NEXT: vzeroupper 2589; CHECK-NEXT: retq 2590bb: 2591 br label %bb1 2592 2593bb1: ; preds = %bb1, %bb 2594 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2595 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 
%tmp 2596 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 2597 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 2598 %tmp5 = icmp slt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2599 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2600 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 2601 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 2602 %tmp8 = add i64 %tmp, 16 2603 %tmp9 = icmp eq i64 %tmp8, 1024 2604 br i1 %tmp9, label %bb10, label %bb1 2605 2606bb10: ; preds = %bb1 2607 ret void 2608} 2609 2610define void @bcast_unfold_smin_v2i64(i64* %arg) { 2611; CHECK-LABEL: bcast_unfold_smin_v2i64: 2612; CHECK: # %bb.0: # %bb 2613; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 2614; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2] 2615; CHECK-NEXT: .p2align 4, 0x90 2616; CHECK-NEXT: .LBB75_1: # %bb1 2617; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2618; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %xmm0, %xmm1 2619; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) 2620; CHECK-NEXT: addq $16, %rax 2621; CHECK-NEXT: jne .LBB75_1 2622; CHECK-NEXT: # %bb.2: # %bb10 2623; CHECK-NEXT: retq 2624bb: 2625 br label %bb1 2626 2627bb1: ; preds = %bb1, %bb 2628 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2629 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 2630 %tmp3 = bitcast i64* %tmp2 to <2 x i64>* 2631 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8 2632 %tmp5 = icmp slt <2 x i64> %tmp4, <i64 2, i64 2> 2633 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2> 2634 %tmp7 = bitcast i64* %tmp2 to <2 x i64>* 2635 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8 2636 %tmp8 = add i64 %tmp, 2 2637 %tmp9 = icmp eq i64 %tmp8, 1024 2638 br i1 %tmp9, label %bb10, label %bb1 2639 2640bb10: ; preds = %bb1 2641 ret void 2642} 2643 2644define void @bcast_unfold_smin_v4i64(i64* 
%arg) { 2645; CHECK-LABEL: bcast_unfold_smin_v4i64: 2646; CHECK: # %bb.0: # %bb 2647; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 2648; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] 2649; CHECK-NEXT: .p2align 4, 0x90 2650; CHECK-NEXT: .LBB76_1: # %bb1 2651; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2652; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %ymm0, %ymm1 2653; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) 2654; CHECK-NEXT: addq $32, %rax 2655; CHECK-NEXT: jne .LBB76_1 2656; CHECK-NEXT: # %bb.2: # %bb10 2657; CHECK-NEXT: vzeroupper 2658; CHECK-NEXT: retq 2659bb: 2660 br label %bb1 2661 2662bb1: ; preds = %bb1, %bb 2663 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2664 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 2665 %tmp3 = bitcast i64* %tmp2 to <4 x i64>* 2666 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8 2667 %tmp5 = icmp slt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2> 2668 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2> 2669 %tmp7 = bitcast i64* %tmp2 to <4 x i64>* 2670 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8 2671 %tmp8 = add i64 %tmp, 4 2672 %tmp9 = icmp eq i64 %tmp8, 1024 2673 br i1 %tmp9, label %bb10, label %bb1 2674 2675bb10: ; preds = %bb1 2676 ret void 2677} 2678 2679define void @bcast_unfold_smin_v8i64(i64* %arg) { 2680; CHECK-LABEL: bcast_unfold_smin_v8i64: 2681; CHECK: # %bb.0: # %bb 2682; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 2683; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] 2684; CHECK-NEXT: .p2align 4, 0x90 2685; CHECK-NEXT: .LBB77_1: # %bb1 2686; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2687; CHECK-NEXT: vpminsq 8192(%rdi,%rax), %zmm0, %zmm1 2688; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) 2689; CHECK-NEXT: addq $64, %rax 2690; CHECK-NEXT: jne .LBB77_1 2691; CHECK-NEXT: # %bb.2: # %bb10 2692; CHECK-NEXT: vzeroupper 2693; CHECK-NEXT: retq 2694bb: 2695 br label %bb1 2696 2697bb1: ; preds = %bb1, %bb 2698 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2699 
%tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 2700 %tmp3 = bitcast i64* %tmp2 to <8 x i64>* 2701 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8 2702 %tmp5 = icmp slt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 2703 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 2704 %tmp7 = bitcast i64* %tmp2 to <8 x i64>* 2705 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8 2706 %tmp8 = add i64 %tmp, 8 2707 %tmp9 = icmp eq i64 %tmp8, 1024 2708 br i1 %tmp9, label %bb10, label %bb1 2709 2710bb10: ; preds = %bb1 2711 ret void 2712} 2713 2714define void @bcast_unfold_smax_v4i32(i32* %arg) { 2715; CHECK-LABEL: bcast_unfold_smax_v4i32: 2716; CHECK: # %bb.0: # %bb 2717; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 2718; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] 2719; CHECK-NEXT: .p2align 4, 0x90 2720; CHECK-NEXT: .LBB78_1: # %bb1 2721; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2722; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %xmm0, %xmm1 2723; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) 2724; CHECK-NEXT: addq $16, %rax 2725; CHECK-NEXT: jne .LBB78_1 2726; CHECK-NEXT: # %bb.2: # %bb10 2727; CHECK-NEXT: retq 2728bb: 2729 br label %bb1 2730 2731bb1: ; preds = %bb1, %bb 2732 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2733 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 2734 %tmp3 = bitcast i32* %tmp2 to <4 x i32>* 2735 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 2736 %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2> 2737 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 2738 %tmp7 = bitcast i32* %tmp2 to <4 x i32>* 2739 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 2740 %tmp8 = add i64 %tmp, 4 2741 %tmp9 = icmp eq i64 %tmp8, 1024 2742 br i1 %tmp9, label %bb10, label %bb1 2743 2744bb10: ; preds = %bb1 2745 ret void 2746} 2747 2748define void @bcast_unfold_smax_v8i32(i32* %arg) { 2749; CHECK-LABEL: 
bcast_unfold_smax_v8i32: 2750; CHECK: # %bb.0: # %bb 2751; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 2752; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] 2753; CHECK-NEXT: .p2align 4, 0x90 2754; CHECK-NEXT: .LBB79_1: # %bb1 2755; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2756; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %ymm0, %ymm1 2757; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) 2758; CHECK-NEXT: addq $32, %rax 2759; CHECK-NEXT: jne .LBB79_1 2760; CHECK-NEXT: # %bb.2: # %bb10 2761; CHECK-NEXT: vzeroupper 2762; CHECK-NEXT: retq 2763bb: 2764 br label %bb1 2765 2766bb1: ; preds = %bb1, %bb 2767 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2768 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 2769 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 2770 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4 2771 %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2772 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2773 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 2774 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 2775 %tmp8 = add i64 %tmp, 8 2776 %tmp9 = icmp eq i64 %tmp8, 1024 2777 br i1 %tmp9, label %bb10, label %bb1 2778 2779bb10: ; preds = %bb1 2780 ret void 2781} 2782 2783define void @bcast_unfold_smax_v16i32(i32* %arg) { 2784; CHECK-LABEL: bcast_unfold_smax_v16i32: 2785; CHECK: # %bb.0: # %bb 2786; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 2787; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2788; CHECK-NEXT: .p2align 4, 0x90 2789; CHECK-NEXT: .LBB80_1: # %bb1 2790; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2791; CHECK-NEXT: vpmaxsd 4096(%rdi,%rax), %zmm0, %zmm1 2792; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) 2793; CHECK-NEXT: addq $64, %rax 2794; CHECK-NEXT: jne .LBB80_1 2795; CHECK-NEXT: # %bb.2: # %bb10 2796; CHECK-NEXT: vzeroupper 2797; CHECK-NEXT: retq 2798bb: 2799 br label %bb1 2800 2801bb1: ; preds = %bb1, 
%bb 2802 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2803 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 2804 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 2805 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 2806 %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2807 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2808 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 2809 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 2810 %tmp8 = add i64 %tmp, 16 2811 %tmp9 = icmp eq i64 %tmp8, 1024 2812 br i1 %tmp9, label %bb10, label %bb1 2813 2814bb10: ; preds = %bb1 2815 ret void 2816} 2817 2818define void @bcast_unfold_smax_v2i64(i64* %arg) { 2819; CHECK-LABEL: bcast_unfold_smax_v2i64: 2820; CHECK: # %bb.0: # %bb 2821; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 2822; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2] 2823; CHECK-NEXT: .p2align 4, 0x90 2824; CHECK-NEXT: .LBB81_1: # %bb1 2825; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2826; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %xmm0, %xmm1 2827; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) 2828; CHECK-NEXT: addq $16, %rax 2829; CHECK-NEXT: jne .LBB81_1 2830; CHECK-NEXT: # %bb.2: # %bb10 2831; CHECK-NEXT: retq 2832bb: 2833 br label %bb1 2834 2835bb1: ; preds = %bb1, %bb 2836 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2837 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 2838 %tmp3 = bitcast i64* %tmp2 to <2 x i64>* 2839 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8 2840 %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 2, i64 2> 2841 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2> 2842 %tmp7 = bitcast i64* %tmp2 to <2 x i64>* 2843 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8 2844 %tmp8 = add i64 %tmp, 2 2845 %tmp9 = icmp eq i64 %tmp8, 1024 2846 br i1 %tmp9, label %bb10, label 
%bb1 2847 2848bb10: ; preds = %bb1 2849 ret void 2850} 2851 2852define void @bcast_unfold_smax_v4i64(i64* %arg) { 2853; CHECK-LABEL: bcast_unfold_smax_v4i64: 2854; CHECK: # %bb.0: # %bb 2855; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 2856; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] 2857; CHECK-NEXT: .p2align 4, 0x90 2858; CHECK-NEXT: .LBB82_1: # %bb1 2859; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2860; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %ymm0, %ymm1 2861; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) 2862; CHECK-NEXT: addq $32, %rax 2863; CHECK-NEXT: jne .LBB82_1 2864; CHECK-NEXT: # %bb.2: # %bb10 2865; CHECK-NEXT: vzeroupper 2866; CHECK-NEXT: retq 2867bb: 2868 br label %bb1 2869 2870bb1: ; preds = %bb1, %bb 2871 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2872 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 2873 %tmp3 = bitcast i64* %tmp2 to <4 x i64>* 2874 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8 2875 %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2> 2876 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2> 2877 %tmp7 = bitcast i64* %tmp2 to <4 x i64>* 2878 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8 2879 %tmp8 = add i64 %tmp, 4 2880 %tmp9 = icmp eq i64 %tmp8, 1024 2881 br i1 %tmp9, label %bb10, label %bb1 2882 2883bb10: ; preds = %bb1 2884 ret void 2885} 2886 2887define void @bcast_unfold_smax_v8i64(i64* %arg) { 2888; CHECK-LABEL: bcast_unfold_smax_v8i64: 2889; CHECK: # %bb.0: # %bb 2890; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 2891; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] 2892; CHECK-NEXT: .p2align 4, 0x90 2893; CHECK-NEXT: .LBB83_1: # %bb1 2894; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2895; CHECK-NEXT: vpmaxsq 8192(%rdi,%rax), %zmm0, %zmm1 2896; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) 2897; CHECK-NEXT: addq $64, %rax 2898; CHECK-NEXT: jne .LBB83_1 2899; CHECK-NEXT: # %bb.2: # %bb10 2900; CHECK-NEXT: vzeroupper 2901; CHECK-NEXT: retq 2902bb: 
2903 br label %bb1 2904 2905bb1: ; preds = %bb1, %bb 2906 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2907 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 2908 %tmp3 = bitcast i64* %tmp2 to <8 x i64>* 2909 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8 2910 %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 2911 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 2912 %tmp7 = bitcast i64* %tmp2 to <8 x i64>* 2913 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8 2914 %tmp8 = add i64 %tmp, 8 2915 %tmp9 = icmp eq i64 %tmp8, 1024 2916 br i1 %tmp9, label %bb10, label %bb1 2917 2918bb10: ; preds = %bb1 2919 ret void 2920} 2921 2922define void @bcast_unfold_umin_v4i32(i32* %arg) { 2923; CHECK-LABEL: bcast_unfold_umin_v4i32: 2924; CHECK: # %bb.0: # %bb 2925; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 2926; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] 2927; CHECK-NEXT: .p2align 4, 0x90 2928; CHECK-NEXT: .LBB84_1: # %bb1 2929; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2930; CHECK-NEXT: vpminud 4096(%rdi,%rax), %xmm0, %xmm1 2931; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) 2932; CHECK-NEXT: addq $16, %rax 2933; CHECK-NEXT: jne .LBB84_1 2934; CHECK-NEXT: # %bb.2: # %bb10 2935; CHECK-NEXT: retq 2936bb: 2937 br label %bb1 2938 2939bb1: ; preds = %bb1, %bb 2940 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2941 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 2942 %tmp3 = bitcast i32* %tmp2 to <4 x i32>* 2943 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 2944 %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2> 2945 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 2946 %tmp7 = bitcast i32* %tmp2 to <4 x i32>* 2947 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 2948 %tmp8 = add i64 %tmp, 4 2949 %tmp9 = icmp eq i64 %tmp8, 1024 2950 br i1 %tmp9, label %bb10, label %bb1 2951 2952bb10: ; preds = %bb1 
2953 ret void 2954} 2955 2956define void @bcast_unfold_umin_v8i32(i32* %arg) { 2957; CHECK-LABEL: bcast_unfold_umin_v8i32: 2958; CHECK: # %bb.0: # %bb 2959; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 2960; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] 2961; CHECK-NEXT: .p2align 4, 0x90 2962; CHECK-NEXT: .LBB85_1: # %bb1 2963; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2964; CHECK-NEXT: vpminud 4096(%rdi,%rax), %ymm0, %ymm1 2965; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) 2966; CHECK-NEXT: addq $32, %rax 2967; CHECK-NEXT: jne .LBB85_1 2968; CHECK-NEXT: # %bb.2: # %bb10 2969; CHECK-NEXT: vzeroupper 2970; CHECK-NEXT: retq 2971bb: 2972 br label %bb1 2973 2974bb1: ; preds = %bb1, %bb 2975 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 2976 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 2977 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 2978 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4 2979 %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2980 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2981 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 2982 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 2983 %tmp8 = add i64 %tmp, 8 2984 %tmp9 = icmp eq i64 %tmp8, 1024 2985 br i1 %tmp9, label %bb10, label %bb1 2986 2987bb10: ; preds = %bb1 2988 ret void 2989} 2990 2991define void @bcast_unfold_umin_v16i32(i32* %arg) { 2992; CHECK-LABEL: bcast_unfold_umin_v16i32: 2993; CHECK: # %bb.0: # %bb 2994; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 2995; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2996; CHECK-NEXT: .p2align 4, 0x90 2997; CHECK-NEXT: .LBB86_1: # %bb1 2998; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 2999; CHECK-NEXT: vpminud 4096(%rdi,%rax), %zmm0, %zmm1 3000; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) 3001; CHECK-NEXT: addq $64, %rax 3002; CHECK-NEXT: jne .LBB86_1 3003; CHECK-NEXT: # %bb.2: # %bb10 3004; 
CHECK-NEXT: vzeroupper 3005; CHECK-NEXT: retq 3006bb: 3007 br label %bb1 3008 3009bb1: ; preds = %bb1, %bb 3010 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3011 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3012 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 3013 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 3014 %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 3015 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 3016 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 3017 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 3018 %tmp8 = add i64 %tmp, 16 3019 %tmp9 = icmp eq i64 %tmp8, 1024 3020 br i1 %tmp9, label %bb10, label %bb1 3021 3022bb10: ; preds = %bb1 3023 ret void 3024} 3025 3026define void @bcast_unfold_umin_v2i64(i64* %arg) { 3027; CHECK-LABEL: bcast_unfold_umin_v2i64: 3028; CHECK: # %bb.0: # %bb 3029; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3030; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2] 3031; CHECK-NEXT: .p2align 4, 0x90 3032; CHECK-NEXT: .LBB87_1: # %bb1 3033; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3034; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %xmm0, %xmm1 3035; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) 3036; CHECK-NEXT: addq $16, %rax 3037; CHECK-NEXT: jne .LBB87_1 3038; CHECK-NEXT: # %bb.2: # %bb10 3039; CHECK-NEXT: retq 3040bb: 3041 br label %bb1 3042 3043bb1: ; preds = %bb1, %bb 3044 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3045 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3046 %tmp3 = bitcast i64* %tmp2 to <2 x i64>* 3047 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8 3048 %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2> 3049 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2> 3050 %tmp7 = bitcast i64* %tmp2 to <2 x i64>* 3051 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8 
3052 %tmp8 = add i64 %tmp, 2 3053 %tmp9 = icmp eq i64 %tmp8, 1024 3054 br i1 %tmp9, label %bb10, label %bb1 3055 3056bb10: ; preds = %bb1 3057 ret void 3058} 3059 3060define void @bcast_unfold_umin_v4i64(i64* %arg) { 3061; CHECK-LABEL: bcast_unfold_umin_v4i64: 3062; CHECK: # %bb.0: # %bb 3063; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3064; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] 3065; CHECK-NEXT: .p2align 4, 0x90 3066; CHECK-NEXT: .LBB88_1: # %bb1 3067; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3068; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %ymm0, %ymm1 3069; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) 3070; CHECK-NEXT: addq $32, %rax 3071; CHECK-NEXT: jne .LBB88_1 3072; CHECK-NEXT: # %bb.2: # %bb10 3073; CHECK-NEXT: vzeroupper 3074; CHECK-NEXT: retq 3075bb: 3076 br label %bb1 3077 3078bb1: ; preds = %bb1, %bb 3079 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3080 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3081 %tmp3 = bitcast i64* %tmp2 to <4 x i64>* 3082 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8 3083 %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2> 3084 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2> 3085 %tmp7 = bitcast i64* %tmp2 to <4 x i64>* 3086 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8 3087 %tmp8 = add i64 %tmp, 4 3088 %tmp9 = icmp eq i64 %tmp8, 1024 3089 br i1 %tmp9, label %bb10, label %bb1 3090 3091bb10: ; preds = %bb1 3092 ret void 3093} 3094 3095define void @bcast_unfold_umin_v8i64(i64* %arg) { 3096; CHECK-LABEL: bcast_unfold_umin_v8i64: 3097; CHECK: # %bb.0: # %bb 3098; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3099; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] 3100; CHECK-NEXT: .p2align 4, 0x90 3101; CHECK-NEXT: .LBB89_1: # %bb1 3102; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3103; CHECK-NEXT: vpminuq 8192(%rdi,%rax), %zmm0, %zmm1 3104; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) 3105; CHECK-NEXT: addq $64, %rax 3106; CHECK-NEXT: jne 
.LBB89_1 3107; CHECK-NEXT: # %bb.2: # %bb10 3108; CHECK-NEXT: vzeroupper 3109; CHECK-NEXT: retq 3110bb: 3111 br label %bb1 3112 3113bb1: ; preds = %bb1, %bb 3114 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3115 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3116 %tmp3 = bitcast i64* %tmp2 to <8 x i64>* 3117 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8 3118 %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 3119 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 3120 %tmp7 = bitcast i64* %tmp2 to <8 x i64>* 3121 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8 3122 %tmp8 = add i64 %tmp, 8 3123 %tmp9 = icmp eq i64 %tmp8, 1024 3124 br i1 %tmp9, label %bb10, label %bb1 3125 3126bb10: ; preds = %bb1 3127 ret void 3128} 3129 3130define void @bcast_unfold_umax_v4i32(i32* %arg) { 3131; CHECK-LABEL: bcast_unfold_umax_v4i32: 3132; CHECK: # %bb.0: # %bb 3133; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3134; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] 3135; CHECK-NEXT: .p2align 4, 0x90 3136; CHECK-NEXT: .LBB90_1: # %bb1 3137; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3138; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %xmm0, %xmm1 3139; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) 3140; CHECK-NEXT: addq $16, %rax 3141; CHECK-NEXT: jne .LBB90_1 3142; CHECK-NEXT: # %bb.2: # %bb10 3143; CHECK-NEXT: retq 3144bb: 3145 br label %bb1 3146 3147bb1: ; preds = %bb1, %bb 3148 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3149 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3150 %tmp3 = bitcast i32* %tmp2 to <4 x i32>* 3151 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 3152 %tmp5 = icmp ugt <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2> 3153 %tmp6 = select <4 x i1> %tmp5, <4 x i32> %tmp4, <4 x i32> <i32 2, i32 2, i32 2, i32 2> 3154 %tmp7 = bitcast i32* %tmp2 to <4 x i32>* 3155 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 3156 %tmp8 = add i64 %tmp, 4 3157 
%tmp9 = icmp eq i64 %tmp8, 1024 3158 br i1 %tmp9, label %bb10, label %bb1 3159 3160bb10: ; preds = %bb1 3161 ret void 3162} 3163 3164define void @bcast_unfold_umax_v8i32(i32* %arg) { 3165; CHECK-LABEL: bcast_unfold_umax_v8i32: 3166; CHECK: # %bb.0: # %bb 3167; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3168; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] 3169; CHECK-NEXT: .p2align 4, 0x90 3170; CHECK-NEXT: .LBB91_1: # %bb1 3171; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3172; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %ymm0, %ymm1 3173; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) 3174; CHECK-NEXT: addq $32, %rax 3175; CHECK-NEXT: jne .LBB91_1 3176; CHECK-NEXT: # %bb.2: # %bb10 3177; CHECK-NEXT: vzeroupper 3178; CHECK-NEXT: retq 3179bb: 3180 br label %bb1 3181 3182bb1: ; preds = %bb1, %bb 3183 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3184 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3185 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 3186 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4 3187 %tmp5 = icmp ugt <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 3188 %tmp6 = select <8 x i1> %tmp5, <8 x i32> %tmp4, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 3189 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 3190 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 3191 %tmp8 = add i64 %tmp, 8 3192 %tmp9 = icmp eq i64 %tmp8, 1024 3193 br i1 %tmp9, label %bb10, label %bb1 3194 3195bb10: ; preds = %bb1 3196 ret void 3197} 3198 3199define void @bcast_unfold_umax_v16i32(i32* %arg) { 3200; CHECK-LABEL: bcast_unfold_umax_v16i32: 3201; CHECK: # %bb.0: # %bb 3202; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3203; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 3204; CHECK-NEXT: .p2align 4, 0x90 3205; CHECK-NEXT: .LBB92_1: # %bb1 3206; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3207; CHECK-NEXT: vpmaxud 4096(%rdi,%rax), %zmm0, %zmm1 3208; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) 3209; 
CHECK-NEXT: addq $64, %rax 3210; CHECK-NEXT: jne .LBB92_1 3211; CHECK-NEXT: # %bb.2: # %bb10 3212; CHECK-NEXT: vzeroupper 3213; CHECK-NEXT: retq 3214bb: 3215 br label %bb1 3216 3217bb1: ; preds = %bb1, %bb 3218 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3219 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3220 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 3221 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 3222 %tmp5 = icmp ugt <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 3223 %tmp6 = select <16 x i1> %tmp5, <16 x i32> %tmp4, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 3224 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 3225 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 3226 %tmp8 = add i64 %tmp, 16 3227 %tmp9 = icmp eq i64 %tmp8, 1024 3228 br i1 %tmp9, label %bb10, label %bb1 3229 3230bb10: ; preds = %bb1 3231 ret void 3232} 3233 3234define void @bcast_unfold_umax_v2i64(i64* %arg) { 3235; CHECK-LABEL: bcast_unfold_umax_v2i64: 3236; CHECK: # %bb.0: # %bb 3237; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3238; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2] 3239; CHECK-NEXT: .p2align 4, 0x90 3240; CHECK-NEXT: .LBB93_1: # %bb1 3241; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3242; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %xmm0, %xmm1 3243; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) 3244; CHECK-NEXT: addq $16, %rax 3245; CHECK-NEXT: jne .LBB93_1 3246; CHECK-NEXT: # %bb.2: # %bb10 3247; CHECK-NEXT: retq 3248bb: 3249 br label %bb1 3250 3251bb1: ; preds = %bb1, %bb 3252 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3253 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3254 %tmp3 = bitcast i64* %tmp2 to <2 x i64>* 3255 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 8 3256 %tmp5 = icmp ugt <2 x i64> %tmp4, <i64 2, i64 2> 3257 %tmp6 = select <2 x i1> %tmp5, <2 x i64> %tmp4, <2 x i64> <i64 2, i64 2> 
3258 %tmp7 = bitcast i64* %tmp2 to <2 x i64>* 3259 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 8 3260 %tmp8 = add i64 %tmp, 2 3261 %tmp9 = icmp eq i64 %tmp8, 1024 3262 br i1 %tmp9, label %bb10, label %bb1 3263 3264bb10: ; preds = %bb1 3265 ret void 3266} 3267 3268define void @bcast_unfold_umax_v4i64(i64* %arg) { 3269; CHECK-LABEL: bcast_unfold_umax_v4i64: 3270; CHECK: # %bb.0: # %bb 3271; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3272; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2] 3273; CHECK-NEXT: .p2align 4, 0x90 3274; CHECK-NEXT: .LBB94_1: # %bb1 3275; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3276; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %ymm0, %ymm1 3277; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) 3278; CHECK-NEXT: addq $32, %rax 3279; CHECK-NEXT: jne .LBB94_1 3280; CHECK-NEXT: # %bb.2: # %bb10 3281; CHECK-NEXT: vzeroupper 3282; CHECK-NEXT: retq 3283bb: 3284 br label %bb1 3285 3286bb1: ; preds = %bb1, %bb 3287 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3288 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3289 %tmp3 = bitcast i64* %tmp2 to <4 x i64>* 3290 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8 3291 %tmp5 = icmp ugt <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2> 3292 %tmp6 = select <4 x i1> %tmp5, <4 x i64> %tmp4, <4 x i64> <i64 2, i64 2, i64 2, i64 2> 3293 %tmp7 = bitcast i64* %tmp2 to <4 x i64>* 3294 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8 3295 %tmp8 = add i64 %tmp, 4 3296 %tmp9 = icmp eq i64 %tmp8, 1024 3297 br i1 %tmp9, label %bb10, label %bb1 3298 3299bb10: ; preds = %bb1 3300 ret void 3301} 3302 3303define void @bcast_unfold_umax_v8i64(i64* %arg) { 3304; CHECK-LABEL: bcast_unfold_umax_v8i64: 3305; CHECK: # %bb.0: # %bb 3306; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3307; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2] 3308; CHECK-NEXT: .p2align 4, 0x90 3309; CHECK-NEXT: .LBB95_1: # %bb1 3310; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3311; CHECK-NEXT: vpmaxuq 8192(%rdi,%rax), %zmm0, %zmm1 3312; 
CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) 3313; CHECK-NEXT: addq $64, %rax 3314; CHECK-NEXT: jne .LBB95_1 3315; CHECK-NEXT: # %bb.2: # %bb10 3316; CHECK-NEXT: vzeroupper 3317; CHECK-NEXT: retq 3318bb: 3319 br label %bb1 3320 3321bb1: ; preds = %bb1, %bb 3322 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3323 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3324 %tmp3 = bitcast i64* %tmp2 to <8 x i64>* 3325 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 8 3326 %tmp5 = icmp ugt <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 3327 %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> 3328 %tmp7 = bitcast i64* %tmp2 to <8 x i64>* 3329 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 8 3330 %tmp8 = add i64 %tmp, 8 3331 %tmp9 = icmp eq i64 %tmp8, 1024 3332 br i1 %tmp9, label %bb10, label %bb1 3333 3334bb10: ; preds = %bb1 3335 ret void 3336} 3337 3338define void @bcast_unfold_pcmpgt_v4i32(i32* %arg) { 3339; CHECK-LABEL: bcast_unfold_pcmpgt_v4i32: 3340; CHECK: # %bb.0: # %bb 3341; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3342; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] 3343; CHECK-NEXT: .p2align 4, 0x90 3344; CHECK-NEXT: .LBB96_1: # %bb1 3345; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3346; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 3347; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 3348; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1} 3349; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) 3350; CHECK-NEXT: addq $16, %rax 3351; CHECK-NEXT: jne .LBB96_1 3352; CHECK-NEXT: # %bb.2: # %bb10 3353; CHECK-NEXT: retq 3354bb: 3355 br label %bb1 3356 3357bb1: ; preds = %bb1, %bb 3358 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3359 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3360 %tmp3 = bitcast i32* %tmp2 to <4 x i32>* 3361 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 3362 %tmp5 = icmp sgt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1> 3363 %tmp6 = select <4 x i1> 
%tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4 3364 %tmp7 = bitcast i32* %tmp2 to <4 x i32>* 3365 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 3366 %tmp8 = add i64 %tmp, 4 3367 %tmp9 = icmp eq i64 %tmp8, 1024 3368 br i1 %tmp9, label %bb10, label %bb1 3369 3370bb10: ; preds = %bb1 3371 ret void 3372} 3373 3374define void @bcast_unfold_pcmpgt_v8i32(i32* %arg) { 3375; CHECK-LABEL: bcast_unfold_pcmpgt_v8i32: 3376; CHECK: # %bb.0: # %bb 3377; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3378; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] 3379; CHECK-NEXT: .p2align 4, 0x90 3380; CHECK-NEXT: .LBB97_1: # %bb1 3381; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3382; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 3383; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 3384; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1} 3385; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) 3386; CHECK-NEXT: addq $32, %rax 3387; CHECK-NEXT: jne .LBB97_1 3388; CHECK-NEXT: # %bb.2: # %bb10 3389; CHECK-NEXT: vzeroupper 3390; CHECK-NEXT: retq 3391bb: 3392 br label %bb1 3393 3394bb1: ; preds = %bb1, %bb 3395 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3396 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3397 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 3398 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4 3399 %tmp5 = icmp sgt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3400 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4 3401 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 3402 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 3403 %tmp8 = add i64 %tmp, 8 3404 %tmp9 = icmp eq i64 %tmp8, 1024 3405 br i1 %tmp9, label %bb10, label %bb1 3406 3407bb10: ; preds = %bb1 3408 ret void 3409} 3410 3411define void @bcast_unfold_pcmpgt_v16i32(i32* %arg) { 3412; CHECK-LABEL: bcast_unfold_pcmpgt_v16i32: 3413; CHECK: # %bb.0: # %bb 3414; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3415; CHECK-NEXT: 
vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 3416; CHECK-NEXT: .p2align 4, 0x90 3417; CHECK-NEXT: .LBB98_1: # %bb1 3418; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3419; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 3420; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 3421; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} 3422; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) 3423; CHECK-NEXT: addq $64, %rax 3424; CHECK-NEXT: jne .LBB98_1 3425; CHECK-NEXT: # %bb.2: # %bb10 3426; CHECK-NEXT: vzeroupper 3427; CHECK-NEXT: retq 3428bb: 3429 br label %bb1 3430 3431bb1: ; preds = %bb1, %bb 3432 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3433 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3434 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 3435 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 3436 %tmp5 = icmp sgt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3437 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4 3438 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 3439 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 3440 %tmp8 = add i64 %tmp, 16 3441 %tmp9 = icmp eq i64 %tmp8, 1024 3442 br i1 %tmp9, label %bb10, label %bb1 3443 3444bb10: ; preds = %bb1 3445 ret void 3446} 3447 3448define void @bcast_unfold_pcmpgt_v2i64(i64* %arg) { 3449; CHECK-LABEL: bcast_unfold_pcmpgt_v2i64: 3450; CHECK: # %bb.0: # %bb 3451; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3452; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1] 3453; CHECK-NEXT: .p2align 4, 0x90 3454; CHECK-NEXT: .LBB99_1: # %bb1 3455; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3456; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 3457; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 3458; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1} 3459; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) 3460; CHECK-NEXT: addq $16, %rax 
3461; CHECK-NEXT: jne .LBB99_1 3462; CHECK-NEXT: # %bb.2: # %bb10 3463; CHECK-NEXT: retq 3464bb: 3465 br label %bb1 3466 3467bb1: ; preds = %bb1, %bb 3468 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3469 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3470 %tmp3 = bitcast i64* %tmp2 to <2 x i64>* 3471 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4 3472 %tmp5 = icmp sgt <2 x i64> %tmp4, <i64 1, i64 1> 3473 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4 3474 %tmp7 = bitcast i64* %tmp2 to <2 x i64>* 3475 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4 3476 %tmp8 = add i64 %tmp, 2 3477 %tmp9 = icmp eq i64 %tmp8, 1024 3478 br i1 %tmp9, label %bb10, label %bb1 3479 3480bb10: ; preds = %bb1 3481 ret void 3482} 3483define void @bcast_unfold_pcmpgt_v4i64(i64* %arg) { 3484; CHECK-LABEL: bcast_unfold_pcmpgt_v4i64: 3485; CHECK: # %bb.0: # %bb 3486; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3487; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] 3488; CHECK-NEXT: .p2align 4, 0x90 3489; CHECK-NEXT: .LBB100_1: # %bb1 3490; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3491; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 3492; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 3493; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1} 3494; CHECK-NEXT: vmovdqu %ymm1, 8192(%rdi,%rax) 3495; CHECK-NEXT: addq $32, %rax 3496; CHECK-NEXT: jne .LBB100_1 3497; CHECK-NEXT: # %bb.2: # %bb10 3498; CHECK-NEXT: vzeroupper 3499; CHECK-NEXT: retq 3500bb: 3501 br label %bb1 3502 3503bb1: ; preds = %bb1, %bb 3504 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3505 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3506 %tmp3 = bitcast i64* %tmp2 to <4 x i64>* 3507 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4 3508 %tmp5 = icmp sgt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1> 3509 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4 3510 %tmp7 = bitcast i64* %tmp2 to <4 x i64>* 3511 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4 
3512 %tmp8 = add i64 %tmp, 4 3513 %tmp9 = icmp eq i64 %tmp8, 1024 3514 br i1 %tmp9, label %bb10, label %bb1 3515 3516bb10: ; preds = %bb1 3517 ret void 3518} 3519 3520define void @bcast_unfold_pcmpgt_v8i64(i64* %arg) { 3521; CHECK-LABEL: bcast_unfold_pcmpgt_v8i64: 3522; CHECK: # %bb.0: # %bb 3523; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3524; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] 3525; CHECK-NEXT: .p2align 4, 0x90 3526; CHECK-NEXT: .LBB101_1: # %bb1 3527; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3528; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 3529; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 3530; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1} 3531; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) 3532; CHECK-NEXT: addq $64, %rax 3533; CHECK-NEXT: jne .LBB101_1 3534; CHECK-NEXT: # %bb.2: # %bb10 3535; CHECK-NEXT: vzeroupper 3536; CHECK-NEXT: retq 3537bb: 3538 br label %bb1 3539 3540bb1: ; preds = %bb1, %bb 3541 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3542 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3543 %tmp3 = bitcast i64* %tmp2 to <8 x i64>* 3544 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4 3545 %tmp5 = icmp sgt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 3546 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4 3547 %tmp7 = bitcast i64* %tmp2 to <8 x i64>* 3548 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4 3549 %tmp8 = add i64 %tmp, 8 3550 %tmp9 = icmp eq i64 %tmp8, 1024 3551 br i1 %tmp9, label %bb10, label %bb1 3552 3553bb10: ; preds = %bb1 3554 ret void 3555} 3556 3557define void @bcast_unfold_pcmpeq_v4i32(i32* %arg) { 3558; CHECK-LABEL: bcast_unfold_pcmpeq_v4i32: 3559; CHECK: # %bb.0: # %bb 3560; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3561; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] 3562; CHECK-NEXT: .p2align 4, 0x90 3563; CHECK-NEXT: .LBB102_1: # %bb1 3564; CHECK-NEXT: # =>This Inner Loop Header: 
Depth=1 3565; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %xmm1 3566; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 3567; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1} 3568; CHECK-NEXT: vmovdqu %xmm1, 4096(%rdi,%rax) 3569; CHECK-NEXT: addq $16, %rax 3570; CHECK-NEXT: jne .LBB102_1 3571; CHECK-NEXT: # %bb.2: # %bb10 3572; CHECK-NEXT: retq 3573bb: 3574 br label %bb1 3575 3576bb1: ; preds = %bb1, %bb 3577 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3578 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3579 %tmp3 = bitcast i32* %tmp2 to <4 x i32>* 3580 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 3581 %tmp5 = icmp eq <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1> 3582 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4 3583 %tmp7 = bitcast i32* %tmp2 to <4 x i32>* 3584 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 3585 %tmp8 = add i64 %tmp, 4 3586 %tmp9 = icmp eq i64 %tmp8, 1024 3587 br i1 %tmp9, label %bb10, label %bb1 3588 3589bb10: ; preds = %bb1 3590 ret void 3591} 3592 3593define void @bcast_unfold_pcmpeq_v8i32(i32* %arg) { 3594; CHECK-LABEL: bcast_unfold_pcmpeq_v8i32: 3595; CHECK: # %bb.0: # %bb 3596; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3597; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] 3598; CHECK-NEXT: .p2align 4, 0x90 3599; CHECK-NEXT: .LBB103_1: # %bb1 3600; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3601; CHECK-NEXT: vmovdqu 4096(%rdi,%rax), %ymm1 3602; CHECK-NEXT: vpcmpeqd %ymm0, %ymm1, %k1 3603; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1} 3604; CHECK-NEXT: vmovdqu %ymm1, 4096(%rdi,%rax) 3605; CHECK-NEXT: addq $32, %rax 3606; CHECK-NEXT: jne .LBB103_1 3607; CHECK-NEXT: # %bb.2: # %bb10 3608; CHECK-NEXT: vzeroupper 3609; CHECK-NEXT: retq 3610bb: 3611 br label %bb1 3612 3613bb1: ; preds = %bb1, %bb 3614 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3615 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3616 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 3617 %tmp4 = load <8 x i32>, <8 x 
i32>* %tmp3, align 4 3618 %tmp5 = icmp eq <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3619 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4 3620 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 3621 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 3622 %tmp8 = add i64 %tmp, 8 3623 %tmp9 = icmp eq i64 %tmp8, 1024 3624 br i1 %tmp9, label %bb10, label %bb1 3625 3626bb10: ; preds = %bb1 3627 ret void 3628} 3629 3630define void @bcast_unfold_pcmpeq_v16i32(i32* %arg) { 3631; CHECK-LABEL: bcast_unfold_pcmpeq_v16i32: 3632; CHECK: # %bb.0: # %bb 3633; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 3634; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 3635; CHECK-NEXT: .p2align 4, 0x90 3636; CHECK-NEXT: .LBB104_1: # %bb1 3637; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3638; CHECK-NEXT: vmovdqu64 4096(%rdi,%rax), %zmm1 3639; CHECK-NEXT: vpcmpeqd %zmm0, %zmm1, %k1 3640; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} 3641; CHECK-NEXT: vmovdqu64 %zmm1, 4096(%rdi,%rax) 3642; CHECK-NEXT: addq $64, %rax 3643; CHECK-NEXT: jne .LBB104_1 3644; CHECK-NEXT: # %bb.2: # %bb10 3645; CHECK-NEXT: vzeroupper 3646; CHECK-NEXT: retq 3647bb: 3648 br label %bb1 3649 3650bb1: ; preds = %bb1, %bb 3651 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3652 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3653 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 3654 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 3655 %tmp5 = icmp eq <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3656 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4 3657 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 3658 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 3659 %tmp8 = add i64 %tmp, 16 3660 %tmp9 = icmp 
eq i64 %tmp8, 1024 3661 br i1 %tmp9, label %bb10, label %bb1 3662 3663bb10: ; preds = %bb1 3664 ret void 3665} 3666 3667define void @bcast_unfold_pcmpeq_v2i64(i64* %arg) { 3668; CHECK-LABEL: bcast_unfold_pcmpeq_v2i64: 3669; CHECK: # %bb.0: # %bb 3670; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3671; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1] 3672; CHECK-NEXT: .p2align 4, 0x90 3673; CHECK-NEXT: .LBB105_1: # %bb1 3674; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3675; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm1 3676; CHECK-NEXT: vpcmpeqq %xmm0, %xmm1, %k1 3677; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1} 3678; CHECK-NEXT: vmovdqu %xmm1, 8192(%rdi,%rax) 3679; CHECK-NEXT: addq $16, %rax 3680; CHECK-NEXT: jne .LBB105_1 3681; CHECK-NEXT: # %bb.2: # %bb10 3682; CHECK-NEXT: retq 3683bb: 3684 br label %bb1 3685 3686bb1: ; preds = %bb1, %bb 3687 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3688 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3689 %tmp3 = bitcast i64* %tmp2 to <2 x i64>* 3690 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4 3691 %tmp5 = icmp eq <2 x i64> %tmp4, <i64 1, i64 1> 3692 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4 3693 %tmp7 = bitcast i64* %tmp2 to <2 x i64>* 3694 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4 3695 %tmp8 = add i64 %tmp, 2 3696 %tmp9 = icmp eq i64 %tmp8, 1024 3697 br i1 %tmp9, label %bb10, label %bb1 3698 3699bb10: ; preds = %bb1 3700 ret void 3701} 3702define void @bcast_unfold_pcmpeq_v4i64(i64* %arg) { 3703; CHECK-LABEL: bcast_unfold_pcmpeq_v4i64: 3704; CHECK: # %bb.0: # %bb 3705; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3706; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] 3707; CHECK-NEXT: .p2align 4, 0x90 3708; CHECK-NEXT: .LBB106_1: # %bb1 3709; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3710; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm1 3711; CHECK-NEXT: vpcmpeqq %ymm0, %ymm1, %k1 3712; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1} 3713; CHECK-NEXT: vmovdqu %ymm1, 
8192(%rdi,%rax) 3714; CHECK-NEXT: addq $32, %rax 3715; CHECK-NEXT: jne .LBB106_1 3716; CHECK-NEXT: # %bb.2: # %bb10 3717; CHECK-NEXT: vzeroupper 3718; CHECK-NEXT: retq 3719bb: 3720 br label %bb1 3721 3722bb1: ; preds = %bb1, %bb 3723 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3724 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3725 %tmp3 = bitcast i64* %tmp2 to <4 x i64>* 3726 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4 3727 %tmp5 = icmp eq <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1> 3728 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4 3729 %tmp7 = bitcast i64* %tmp2 to <4 x i64>* 3730 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4 3731 %tmp8 = add i64 %tmp, 4 3732 %tmp9 = icmp eq i64 %tmp8, 1024 3733 br i1 %tmp9, label %bb10, label %bb1 3734 3735bb10: ; preds = %bb1 3736 ret void 3737} 3738 3739define void @bcast_unfold_pcmpeq_v8i64(i64* %arg) { 3740; CHECK-LABEL: bcast_unfold_pcmpeq_v8i64: 3741; CHECK: # %bb.0: # %bb 3742; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 3743; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] 3744; CHECK-NEXT: .p2align 4, 0x90 3745; CHECK-NEXT: .LBB107_1: # %bb1 3746; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3747; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm1 3748; CHECK-NEXT: vpcmpeqq %zmm0, %zmm1, %k1 3749; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1} 3750; CHECK-NEXT: vmovdqu64 %zmm1, 8192(%rdi,%rax) 3751; CHECK-NEXT: addq $64, %rax 3752; CHECK-NEXT: jne .LBB107_1 3753; CHECK-NEXT: # %bb.2: # %bb10 3754; CHECK-NEXT: vzeroupper 3755; CHECK-NEXT: retq 3756bb: 3757 br label %bb1 3758 3759bb1: ; preds = %bb1, %bb 3760 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3761 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3762 %tmp3 = bitcast i64* %tmp2 to <8 x i64>* 3763 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4 3764 %tmp5 = icmp eq <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 3765 %tmp6 = select <8 x i1> %tmp5, <8 x i64> 
<i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4 3766 %tmp7 = bitcast i64* %tmp2 to <8 x i64>* 3767 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4 3768 %tmp8 = add i64 %tmp, 8 3769 %tmp9 = icmp eq i64 %tmp8, 1024 3770 br i1 %tmp9, label %bb10, label %bb1 3771 3772bb10: ; preds = %bb1 3773 ret void 3774} 3775 3776define void @bcast_unfold_pcmp_v4i32(i32* %arg) { 3777; CHECK-LABEL: bcast_unfold_pcmp_v4i32: 3778; CHECK: # %bb.0: # %bb 3779; CHECK-NEXT: xorl %eax, %eax 3780; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] 3781; CHECK-NEXT: .p2align 4, 0x90 3782; CHECK-NEXT: .LBB108_1: # %bb1 3783; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3784; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 3785; CHECK-NEXT: vpcmpltd %xmm0, %xmm1, %k1 3786; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1} 3787; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) 3788; CHECK-NEXT: addq $4, %rax 3789; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 3790; CHECK-NEXT: jg .LBB108_1 3791; CHECK-NEXT: # %bb.2: # %bb10 3792; CHECK-NEXT: retq 3793bb: 3794 br label %bb1 3795 3796bb1: ; preds = %bb1, %bb 3797 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3798 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3799 %tmp3 = bitcast i32* %tmp2 to <4 x i32>* 3800 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 3801 %tmp5 = icmp slt <4 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1> 3802 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4 3803 %tmp7 = bitcast i32* %tmp2 to <4 x i32>* 3804 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 3805 %tmp8 = add i64 %tmp, 4 3806 %tmp9 = icmp slt i64 %tmp8, 1024 3807 br i1 %tmp9, label %bb10, label %bb1 3808 3809bb10: ; preds = %bb1 3810 ret void 3811} 3812 3813define void @bcast_unfold_pcmp_v8i32(i32* %arg) { 3814; CHECK-LABEL: bcast_unfold_pcmp_v8i32: 3815; CHECK: # %bb.0: # %bb 3816; CHECK-NEXT: xorl %eax, %eax 3817; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] 3818; CHECK-NEXT: .p2align 4, 0x90 
3819; CHECK-NEXT: .LBB109_1: # %bb1 3820; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3821; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 3822; CHECK-NEXT: vpcmpltd %ymm0, %ymm1, %k1 3823; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1} 3824; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) 3825; CHECK-NEXT: addq $8, %rax 3826; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 3827; CHECK-NEXT: jg .LBB109_1 3828; CHECK-NEXT: # %bb.2: # %bb10 3829; CHECK-NEXT: vzeroupper 3830; CHECK-NEXT: retq 3831bb: 3832 br label %bb1 3833 3834bb1: ; preds = %bb1, %bb 3835 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3836 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3837 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 3838 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4 3839 %tmp5 = icmp slt <8 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3840 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4 3841 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 3842 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 3843 %tmp8 = add i64 %tmp, 8 3844 %tmp9 = icmp slt i64 %tmp8, 1024 3845 br i1 %tmp9, label %bb10, label %bb1 3846 3847bb10: ; preds = %bb1 3848 ret void 3849} 3850 3851define void @bcast_unfold_pcmp_v16i32(i32* %arg) { 3852; CHECK-LABEL: bcast_unfold_pcmp_v16i32: 3853; CHECK: # %bb.0: # %bb 3854; CHECK-NEXT: xorl %eax, %eax 3855; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 3856; CHECK-NEXT: .p2align 4, 0x90 3857; CHECK-NEXT: .LBB110_1: # %bb1 3858; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3859; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 3860; CHECK-NEXT: vpcmpltd %zmm0, %zmm1, %k1 3861; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} 3862; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) 3863; CHECK-NEXT: addq $16, %rax 3864; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 3865; CHECK-NEXT: jg .LBB110_1 3866; CHECK-NEXT: # %bb.2: # %bb10 3867; CHECK-NEXT: vzeroupper 3868; CHECK-NEXT: 
retq 3869bb: 3870 br label %bb1 3871 3872bb1: ; preds = %bb1, %bb 3873 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3874 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 3875 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 3876 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 3877 %tmp5 = icmp slt <16 x i32> %tmp4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 3878 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4 3879 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 3880 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 3881 %tmp8 = add i64 %tmp, 16 3882 %tmp9 = icmp slt i64 %tmp8, 1024 3883 br i1 %tmp9, label %bb10, label %bb1 3884 3885bb10: ; preds = %bb1 3886 ret void 3887} 3888 3889define void @bcast_unfold_pcmp_v2i64(i64* %arg) { 3890; CHECK-LABEL: bcast_unfold_pcmp_v2i64: 3891; CHECK: # %bb.0: # %bb 3892; CHECK-NEXT: xorl %eax, %eax 3893; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1] 3894; CHECK-NEXT: .p2align 4, 0x90 3895; CHECK-NEXT: .LBB111_1: # %bb1 3896; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3897; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %xmm1 3898; CHECK-NEXT: vpcmpltq %xmm0, %xmm1, %k1 3899; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm1 {%k1} 3900; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,8) 3901; CHECK-NEXT: addq $2, %rax 3902; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 3903; CHECK-NEXT: jg .LBB111_1 3904; CHECK-NEXT: # %bb.2: # %bb10 3905; CHECK-NEXT: retq 3906bb: 3907 br label %bb1 3908 3909bb1: ; preds = %bb1, %bb 3910 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3911 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3912 %tmp3 = bitcast i64* %tmp2 to <2 x i64>* 3913 %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4 3914 %tmp5 = icmp slt <2 x i64> %tmp4, <i64 1, i64 1> 3915 %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4 3916 %tmp7 = bitcast 
i64* %tmp2 to <2 x i64>* 3917 store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4 3918 %tmp8 = add i64 %tmp, 2 3919 %tmp9 = icmp slt i64 %tmp8, 1024 3920 br i1 %tmp9, label %bb10, label %bb1 3921 3922bb10: ; preds = %bb1 3923 ret void 3924} 3925define void @bcast_unfold_pcmp_v4i64(i64* %arg) { 3926; CHECK-LABEL: bcast_unfold_pcmp_v4i64: 3927; CHECK: # %bb.0: # %bb 3928; CHECK-NEXT: xorl %eax, %eax 3929; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,1,1,1] 3930; CHECK-NEXT: .p2align 4, 0x90 3931; CHECK-NEXT: .LBB112_1: # %bb1 3932; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3933; CHECK-NEXT: vmovdqu (%rdi,%rax,8), %ymm1 3934; CHECK-NEXT: vpcmpltq %ymm0, %ymm1, %k1 3935; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 {%k1} 3936; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,8) 3937; CHECK-NEXT: addq $4, %rax 3938; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 3939; CHECK-NEXT: jg .LBB112_1 3940; CHECK-NEXT: # %bb.2: # %bb10 3941; CHECK-NEXT: vzeroupper 3942; CHECK-NEXT: retq 3943bb: 3944 br label %bb1 3945 3946bb1: ; preds = %bb1, %bb 3947 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3948 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3949 %tmp3 = bitcast i64* %tmp2 to <4 x i64>* 3950 %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4 3951 %tmp5 = icmp slt <4 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1> 3952 %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4 3953 %tmp7 = bitcast i64* %tmp2 to <4 x i64>* 3954 store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4 3955 %tmp8 = add i64 %tmp, 4 3956 %tmp9 = icmp slt i64 %tmp8, 1024 3957 br i1 %tmp9, label %bb10, label %bb1 3958 3959bb10: ; preds = %bb1 3960 ret void 3961} 3962 3963define void @bcast_unfold_pcmp_v8i64(i64* %arg) { 3964; CHECK-LABEL: bcast_unfold_pcmp_v8i64: 3965; CHECK: # %bb.0: # %bb 3966; CHECK-NEXT: xorl %eax, %eax 3967; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1] 3968; CHECK-NEXT: .p2align 4, 0x90 3969; CHECK-NEXT: .LBB113_1: # %bb1 3970; CHECK-NEXT: # =>This Inner 
Loop Header: Depth=1 3971; CHECK-NEXT: vmovdqu64 (%rdi,%rax,8), %zmm1 3972; CHECK-NEXT: vpcmpltq %zmm0, %zmm1, %k1 3973; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1 {%k1} 3974; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,8) 3975; CHECK-NEXT: addq $8, %rax 3976; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 3977; CHECK-NEXT: jg .LBB113_1 3978; CHECK-NEXT: # %bb.2: # %bb10 3979; CHECK-NEXT: vzeroupper 3980; CHECK-NEXT: retq 3981bb: 3982 br label %bb1 3983 3984bb1: ; preds = %bb1, %bb 3985 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 3986 %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp 3987 %tmp3 = bitcast i64* %tmp2 to <8 x i64>* 3988 %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4 3989 %tmp5 = icmp slt <8 x i64> %tmp4, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 3990 %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4 3991 %tmp7 = bitcast i64* %tmp2 to <8 x i64>* 3992 store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4 3993 %tmp8 = add i64 %tmp, 8 3994 %tmp9 = icmp slt i64 %tmp8, 1024 3995 br i1 %tmp9, label %bb10, label %bb1 3996 3997bb10: ; preds = %bb1 3998 ret void 3999} 4000 4001define void @bcast_unfold_pcmpu_v4i32(i32* %arg) { 4002; CHECK-LABEL: bcast_unfold_pcmpu_v4i32: 4003; CHECK: # %bb.0: # %bb 4004; CHECK-NEXT: xorl %eax, %eax 4005; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] 4006; CHECK-NEXT: .p2align 4, 0x90 4007; CHECK-NEXT: .LBB114_1: # %bb1 4008; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 4009; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %xmm1 4010; CHECK-NEXT: vpcmpltud %xmm0, %xmm1, %k1 4011; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 {%k1} 4012; CHECK-NEXT: vmovdqu %xmm1, (%rdi,%rax,4) 4013; CHECK-NEXT: addq $4, %rax 4014; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 4015; CHECK-NEXT: ja .LBB114_1 4016; CHECK-NEXT: # %bb.2: # %bb10 4017; CHECK-NEXT: retq 4018bb: 4019 br label %bb1 4020 4021bb1: ; preds = %bb1, %bb 4022 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 4023 
%tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 4024 %tmp3 = bitcast i32* %tmp2 to <4 x i32>* 4025 %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4 4026 %tmp5 = icmp ult <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2> 4027 %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4 4028 %tmp7 = bitcast i32* %tmp2 to <4 x i32>* 4029 store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4 4030 %tmp8 = add i64 %tmp, 4 4031 %tmp9 = icmp ult i64 %tmp8, 1024 4032 br i1 %tmp9, label %bb10, label %bb1 4033 4034bb10: ; preds = %bb1 4035 ret void 4036} 4037 4038define void @bcast_unfold_pcmpu_v8i32(i32* %arg) { 4039; CHECK-LABEL: bcast_unfold_pcmpu_v8i32: 4040; CHECK: # %bb.0: # %bb 4041; CHECK-NEXT: xorl %eax, %eax 4042; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] 4043; CHECK-NEXT: .p2align 4, 0x90 4044; CHECK-NEXT: .LBB115_1: # %bb1 4045; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 4046; CHECK-NEXT: vmovdqu (%rdi,%rax,4), %ymm1 4047; CHECK-NEXT: vpcmpltud %ymm0, %ymm1, %k1 4048; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 {%k1} 4049; CHECK-NEXT: vmovdqu %ymm1, (%rdi,%rax,4) 4050; CHECK-NEXT: addq $8, %rax 4051; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 4052; CHECK-NEXT: ja .LBB115_1 4053; CHECK-NEXT: # %bb.2: # %bb10 4054; CHECK-NEXT: vzeroupper 4055; CHECK-NEXT: retq 4056bb: 4057 br label %bb1 4058 4059bb1: ; preds = %bb1, %bb 4060 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 4061 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 4062 %tmp3 = bitcast i32* %tmp2 to <8 x i32>* 4063 %tmp4 = load <8 x i32>, <8 x i32>* %tmp3, align 4 4064 %tmp5 = icmp ult <8 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 4065 %tmp6 = select <8 x i1> %tmp5, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <8 x i32> %tmp4 4066 %tmp7 = bitcast i32* %tmp2 to <8 x i32>* 4067 store <8 x i32> %tmp6, <8 x i32>* %tmp7, align 4 4068 %tmp8 = add i64 %tmp, 8 4069 %tmp9 = icmp ult i64 %tmp8, 1024 4070 br i1 
%tmp9, label %bb10, label %bb1 4071 4072bb10: ; preds = %bb1 4073 ret void 4074} 4075 4076define void @bcast_unfold_pcmpu_v16i32(i32* %arg) { 4077; CHECK-LABEL: bcast_unfold_pcmpu_v16i32: 4078; CHECK: # %bb.0: # %bb 4079; CHECK-NEXT: xorl %eax, %eax 4080; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 4081; CHECK-NEXT: .p2align 4, 0x90 4082; CHECK-NEXT: .LBB116_1: # %bb1 4083; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 4084; CHECK-NEXT: vmovdqu64 (%rdi,%rax,4), %zmm1 4085; CHECK-NEXT: vpcmpltud %zmm0, %zmm1, %k1 4086; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 {%k1} 4087; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi,%rax,4) 4088; CHECK-NEXT: addq $16, %rax 4089; CHECK-NEXT: cmpq $1023, %rax # imm = 0x3FF 4090; CHECK-NEXT: ja .LBB116_1 4091; CHECK-NEXT: # %bb.2: # %bb10 4092; CHECK-NEXT: vzeroupper 4093; CHECK-NEXT: retq 4094bb: 4095 br label %bb1 4096 4097bb1: ; preds = %bb1, %bb 4098 %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ] 4099 %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp 4100 %tmp3 = bitcast i32* %tmp2 to <16 x i32>* 4101 %tmp4 = load <16 x i32>, <16 x i32>* %tmp3, align 4 4102 %tmp5 = icmp ult <16 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 4103 %tmp6 = select <16 x i1> %tmp5, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>, <16 x i32> %tmp4 4104 %tmp7 = bitcast i32* %tmp2 to <16 x i32>* 4105 store <16 x i32> %tmp6, <16 x i32>* %tmp7, align 4 4106 %tmp8 = add i64 %tmp, 16 4107 %tmp9 = icmp ult i64 %tmp8, 1024 4108 br i1 %tmp9, label %bb10, label %bb1 4109 4110bb10: ; preds = %bb1 4111 ret void 4112} 4113 4114define void @bcast_unfold_pcmpu_v2i64(i64* %arg) { 4115; CHECK-LABEL: bcast_unfold_pcmpu_v2i64: 4116; CHECK: # %bb.0: # %bb 4117; CHECK-NEXT: xorl %eax, %eax 4118; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,2] 4119; CHECK-NEXT: .p2align 4, 0x90 4120; CHECK-NEXT: 
.LBB117_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %xmm1
; CHECK-NEXT:    vpcmpltuq %xmm0, %xmm1, %k1
; CHECK-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm1 {%k1}
; CHECK-NEXT:    vmovdqu %xmm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $2, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB117_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp3 = bitcast i64* %tmp2 to <2 x i64>*
  %tmp4 = load <2 x i64>, <2 x i64>* %tmp3, align 4
  %tmp5 = icmp ult <2 x i64> %tmp4, <i64 2, i64 2>
  %tmp6 = select <2 x i1> %tmp5, <2 x i64> <i64 3, i64 3>, <2 x i64> %tmp4
  %tmp7 = bitcast i64* %tmp2 to <2 x i64>*
  store <2 x i64> %tmp6, <2 x i64>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}
; Unsigned compare (icmp ult) + select-of-splat on <4 x i64>: expects
; vpcmpltuq with the select constant applied via masked vpbroadcastq.
define void @bcast_unfold_pcmpu_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB118_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu (%rdi,%rax,8), %ymm1
; CHECK-NEXT:    vpcmpltuq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
; CHECK-NEXT:    vmovdqu %ymm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $4, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB118_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 4
  %tmp5 = icmp ult <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; Same pattern widened to <8 x i64> (zmm).
define void @bcast_unfold_pcmpu_v8i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_pcmpu_v8i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB119_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 (%rdi,%rax,8), %zmm1
; CHECK-NEXT:    vpcmpltuq %zmm0, %zmm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1 {%k1}
; CHECK-NEXT:    vmovdqu64 %zmm1, (%rdi,%rax,8)
; CHECK-NEXT:    addq $8, %rax
; CHECK-NEXT:    cmpq $1023, %rax # imm = 0x3FF
; CHECK-NEXT:    ja .LBB119_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp3 = bitcast i64* %tmp2 to <8 x i64>*
  %tmp4 = load <8 x i64>, <8 x i64>* %tmp3, align 4
  %tmp5 = icmp ult <8 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %tmp6 = select <8 x i1> %tmp5, <8 x i64> <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>, <8 x i64> %tmp4
  %tmp7 = bitcast i64* %tmp2 to <8 x i64>*
  store <8 x i64> %tmp6, <8 x i64>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp ult i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; FP compare (fcmp olt) + select: both the compare constant (2.0) and the
; select constant (3.0) should be unfolded into broadcasts hoisted out of
; the loop, with a masked blend (vblendmps) inside.
define void @bcast_unfold_cmp_v4f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v4f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB120_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm2
; CHECK-NEXT:    vcmpltps %xmm0, %xmm2, %k1
; CHECK-NEXT:    vblendmps %xmm2, %xmm1, %xmm2 {%k1}
; CHECK-NEXT:    vmovups %xmm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB120_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <4 x float>*
  %tmp4 = load <4 x float>, <4 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <4 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x float> %tmp4, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <4 x float>*
  store <4 x float> %tmp6, <4 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v8f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v8f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB121_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm2
; CHECK-NEXT:    vcmpltps %ymm0, %ymm2, %k1
; CHECK-NEXT:    vblendmps %ymm2, %ymm1, %ymm2 {%k1}
; CHECK-NEXT:    vmovups %ymm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB121_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <8 x float>*
  %tmp4 = load <8 x float>, <8 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <8 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x float> %tmp4, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <8 x float>*
  store <8 x float> %tmp6, <8 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v16f32(float* %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v16f32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB122_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm2
; CHECK-NEXT:    vcmpltps %zmm0, %zmm2, %k1
; CHECK-NEXT:    vblendmps %zmm2, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vmovups %zmm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB122_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds float, float* %arg, i64 %tmp
  %tmp3 = bitcast float* %tmp2 to <16 x float>*
  %tmp4 = load <16 x float>, <16 x float>* %tmp3, align 4
  %tmp5 = fcmp olt <16 x float> %tmp4, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %tmp6 = select <16 x i1> %tmp5, <16 x float> %tmp4, <16 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  %tmp7 = bitcast float* %tmp2 to <16 x float>*
  store <16 x float> %tmp6, <16 x float>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 16
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; <2 x double> variant: the 128-bit double constants stay as full vmovapd
; loads (no broadcast) per the generated checks.
define void @bcast_unfold_cmp_v2f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v2f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.0E+0,2.0E+0]
; CHECK-NEXT:    vmovapd {{.*#+}} xmm1 = [3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB123_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %xmm2
; CHECK-NEXT:    vcmpltpd %xmm0, %xmm2, %k1
; CHECK-NEXT:    vblendmpd %xmm2, %xmm1, %xmm2 {%k1}
; CHECK-NEXT:    vmovupd %xmm2, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB123_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <2 x double>*
  %tmp4 = load <2 x double>, <2 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <2 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <2 x i1> %tmp5, <2 x double> %tmp4, <2 x double> <double 3.000000e+00, double 3.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <2 x double>*
  store <2 x double> %tmp6, <2 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 2
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v4f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v4f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB124_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm2
; CHECK-NEXT:    vcmpltpd %ymm0, %ymm2, %k1
; CHECK-NEXT:    vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
; CHECK-NEXT:    vmovupd %ymm2, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB124_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <4 x double>*
  %tmp4 = load <4 x double>, <4 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <4 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <4 x i1> %tmp5, <4 x double> %tmp4, <4 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <4 x double>*
  store <4 x double> %tmp6, <4 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

define void @bcast_unfold_cmp_v8f64(double* %arg) {
; CHECK-LABEL: bcast_unfold_cmp_v8f64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB125_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm2
; CHECK-NEXT:    vcmpltpd %zmm0, %zmm2, %k1
; CHECK-NEXT:    vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
; CHECK-NEXT:    vmovupd %zmm2, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB125_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds double, double* %arg, i64 %tmp
  %tmp3 = bitcast double* %tmp2 to <8 x double>*
  %tmp4 = load <8 x double>, <8 x double>* %tmp3, align 8
  %tmp5 = fcmp olt <8 x double> %tmp4, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
  %tmp6 = select <8 x i1> %tmp5, <8 x double> %tmp4, <8 x double> <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  %tmp7 = bitcast double* %tmp2 to <8 x double>*
  store <8 x double> %tmp6, <8 x double>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 8
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; Both select arms are constant splats here: one broadcast (4.0) is re-folded
; into the masked vblendmps as a {1to8} memory operand per the checks.
define void @bcast_unfold_cmp_v8f32_refold(float* nocapture %0) {
; CHECK-LABEL: bcast_unfold_cmp_v8f32_refold:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB126_1: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vcmpgtps 4096(%rdi,%rax), %ymm0, %k1
; CHECK-NEXT:    vblendmps {{.*}}(%rip){1to8}, %ymm1, %ymm2 {%k1}
; CHECK-NEXT:    vmovups %ymm2, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB126_1
; CHECK-NEXT:  # %bb.2:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br label %2

2:                                                ; preds = %2, %1
  %3 = phi i64 [ 0, %1 ], [ %10, %2 ]
  %4 = getelementptr inbounds float, float* %0, i64 %3
  %5 = bitcast float* %4 to <8 x float>*
  %6 = load <8 x float>, <8 x float>* %5, align 4
  %7 = fcmp olt <8 x float> %6, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
  %8 = select <8 x i1> %7, <8 x float> <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>, <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
  %9 = bitcast float* %4 to <8 x float>*
  store <8 x float> %8, <8 x float>* %9, align 4
  %10 = add i64 %3, 8
  %11 = icmp eq i64 %10, 1024
  br i1 %11, label %12, label %2

12:                                               ; preds = %2
  ret void
}

; and + icmp ne 0 should select vptestmd, with the splat-3 select value
; applied via masked vpbroadcastd.
define void @bcast_unfold_ptestm_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB127_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vptestmd %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB127_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp5 = icmp ne <4 x i32> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; and + icmp eq 0 is the negated form: expects vptestnmd.
define void @bcast_unfold_ptestnm_v4i32(i32* %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB128_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 4096(%rdi,%rax), %xmm1
; CHECK-NEXT:    vptestnmd %xmm0, %xmm1, %k1
; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1 {%k1}
; CHECK-NEXT:    vmovdqu %xmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $16, %rax
; CHECK-NEXT:    jne .LBB128_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp3 = bitcast i32* %tmp2 to <4 x i32>*
  %tmp4 = load <4 x i32>, <4 x i32>* %tmp3, align 4
  %tmp4b = and <4 x i32> %tmp4, <i32 2, i32 2, i32 2, i32 2>
  %tmp5 = icmp eq <4 x i32> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> %tmp4
  %tmp7 = bitcast i32* %tmp2 to <4 x i32>*
  store <4 x i32> %tmp6, <4 x i32>* %tmp7, align 4
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; 64-bit element version of the ptestm pattern: expects vptestmq.
define void @bcast_unfold_ptestm_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_ptestm_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB129_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vptestmq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB129_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp5 = icmp ne <4 x i64> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; 64-bit element version of the ptestnm pattern: expects vptestnmq.
define void @bcast_unfold_ptestnm_v4i64(i64* %arg) {
; CHECK-LABEL: bcast_unfold_ptestnm_v4i64:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
; CHECK-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [2,2,2,2]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB130_1: # %bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu 8192(%rdi,%rax), %ymm1
; CHECK-NEXT:    vptestnmq %ymm0, %ymm1, %k1
; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1 {%k1}
; CHECK-NEXT:    vmovdqu %ymm1, 8192(%rdi,%rax)
; CHECK-NEXT:    addq $32, %rax
; CHECK-NEXT:    jne .LBB130_1
; CHECK-NEXT:  # %bb.2: # %bb10
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb1

bb1:                                              ; preds = %bb1, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb1 ]
  %tmp2 = getelementptr inbounds i64, i64* %arg, i64 %tmp
  %tmp3 = bitcast i64* %tmp2 to <4 x i64>*
  %tmp4 = load <4 x i64>, <4 x i64>* %tmp3, align 8
  %tmp4b = and <4 x i64> %tmp4, <i64 2, i64 2, i64 2, i64 2>
  %tmp5 = icmp eq <4 x i64> %tmp4b, zeroinitializer
  %tmp6 = select <4 x i1> %tmp5, <4 x i64> <i64 3, i64 3, i64 3, i64 3>, <4 x i64> %tmp4
  %tmp7 = bitcast i64* %tmp2 to <4 x i64>*
  store <4 x i64> %tmp6, <4 x i64>* %tmp7, align 8
  %tmp8 = add i64 %tmp, 4
  %tmp9 = icmp eq i64 %tmp8, 1024
  br i1 %tmp9, label %bb10, label %bb1

bb10:                                             ; preds = %bb1
  ret void
}

; The or/and pattern here should be turned into vpternlog. The multiply is
; there to increase the use count of the loads so they can't fold. We want to
; unfold the broadcast and pull it out of the loop.
define void @bcast_unfold_vpternlog_v16i32(i32* %arg, i32* %arg1) {
; CHECK-LABEL: bcast_unfold_vpternlog_v16i32:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB131_1: # %bb2
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovdqu64 4096(%rdi,%rax), %zmm1
; CHECK-NEXT:    vmovdqu64 4096(%rsi,%rax), %zmm2
; CHECK-NEXT:    vpmulld %zmm2, %zmm1, %zmm3
; CHECK-NEXT:    vpternlogd $216, %zmm0, %zmm1, %zmm2
; CHECK-NEXT:    vpmulld %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vmovdqu64 %zmm1, 4096(%rdi,%rax)
; CHECK-NEXT:    addq $64, %rax
; CHECK-NEXT:    jne .LBB131_1
; CHECK-NEXT:  # %bb.2: # %bb20
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %tmp = phi i64 [ 0, %bb ], [ %tmp18, %bb2 ]
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp
  %tmp4 = bitcast i32* %tmp3 to <16 x i32>*
  %tmp5 = load <16 x i32>, <16 x i32>* %tmp4, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg1, i64 %tmp
  %tmp10 = bitcast i32* %tmp6 to <16 x i32>*
  %tmp11 = load <16 x i32>, <16 x i32>* %tmp10, align 4
  %tmp12 = and <16 x i32> %tmp5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %tmp13 = and <16 x i32> %tmp11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %tmp14 = or <16 x i32> %tmp12, %tmp13
  %tmp15 = mul <16 x i32> %tmp14, %tmp5
  %tmp16 = mul <16 x i32> %tmp15, %tmp11
  %tmp17 = bitcast i32* %tmp3 to <16 x i32>*
  store <16 x i32> %tmp16, <16 x i32>* %tmp17, align 4
  %tmp18 = add i64 %tmp, 16
  %tmp19 = icmp eq i64 %tmp18, 1024
  br i1 %tmp19, label %bb20, label %bb2

bb20:                                             ; preds = %bb2
  ret void
}