; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64

define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: A:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl (%eax), %ecx
; X32-NEXT:    movl 4(%eax), %eax
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: A:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: A2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %edx
; X32-NEXT:    movl 4(%ecx), %ecx
; X32-NEXT:    movl %ecx, 4(%eax)
; X32-NEXT:    movl %edx, (%eax)
; X32-NEXT:    vmovd %edx, %xmm0
; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: A2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  ret <8 x i32> %vecinit6.i
}

define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}

define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: B3:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %ecx
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}

define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: C:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: C:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 8
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: C2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    vmovsd %xmm0, (%eax)
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: C2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vmovsd %xmm0, (%rsi)
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 8
  store double %q, double* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  ret <8 x float> %vecinit6.i
}

define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}

define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: D3:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss %xmm0, (%eax)
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}

;;;; 128-bit versions

define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: e:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: e:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}

define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: e2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss %xmm0, (%eax)
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}

; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
; X32-NEXT:    retl
;
; X64-LABEL: _e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0xbf80000000000000, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
  ret <4 x float> %vecinit6.i
}


define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: F:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: F:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}

define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: F2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %ecx
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: F2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}

; FIXME: Pointer adjusted broadcasts

define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i32_4i32_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, <4 x i32>* %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %ret
}

define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 12(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, <4 x i32>* %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i32> %ret
}

define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_8i32_55555555:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x i32>, <8 x i32>* %ptr
  %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %ret
}

define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f32_4f32_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 4(%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, <4 x float>* %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %ret
}

define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_4f32_33333333:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 12(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, <4 x float>* %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x float> %ret
}

define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_8f32_55555555:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x float>, <8 x float>* %ptr
  %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %ret
}

define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2i64_2i64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, <2 x i64>* %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %ret
}

define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, <2 x i64>* %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i64> %ret
}

define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_4i64_2222:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i64>, <4 x i64>* %ptr
  %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i64> %ret
}

define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, <2 x double>* %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %ret
}

define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_2f64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, <2 x double>* %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x double> %ret
}

define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_4f64_2222:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x double>, <4 x double>* %ptr
  %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x double> %ret
}

; Unsupported vbroadcasts

define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: G:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl (%eax), %ecx
; X32-NEXT:    movl 4(%eax), %eax
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: G:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}

define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: G2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %edx
; X32-NEXT:    movl 4(%ecx), %ecx
; X32-NEXT:    movl %ecx, 4(%eax)
; X32-NEXT:    movl %edx, (%eax)
; X32-NEXT:    vmovd %edx, %xmm0
; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: G2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}

define <4 x i32> @H(<4 x i32> %a) {
; X32-LABEL: H:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: H:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT:    retq
entry:
  %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %x
}

define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: I:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: I:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 4
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}

define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: I2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    vmovsd %xmm0, (%eax)
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: I2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vmovsd %xmm0, (%rsi)
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 4
  store double %q, double* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}

define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastss (%ecx), %xmm0
; X32-NEXT:    movl (%eax), %eax
; X32-NEXT:    movl %eax, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: _RR:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    movl (%rsi), %eax
; X64-NEXT:    movl %eax, (%rax)
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ; force a chain
  %j = load i32, i32* %k, align 4
  store i32 %j, i32* undef
  ret <4 x float> %vecinit6.i
}

define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: _RR2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %v = insertelement <4 x float> undef, float %q, i32 0
  %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %t
}

; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
; (via the insertelements).

define <8 x float> @splat_concat1(float* %p) {
; X32-LABEL: splat_concat1:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat1:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, float* %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %6
}

define <8 x float> @splat_concat2(float* %p) {
; X32-LABEL: splat_concat2:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat2:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, float* %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = insertelement <4 x float> undef, float %1, i32 0
  %7 = insertelement <4 x float> %6, float %1, i32 1
  %8 = insertelement <4 x float> %7, float %1, i32 2
  %9 = insertelement <4 x float> %8, float %1, i32 3
  %10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %10
}

define <4 x double> @splat_concat3(double* %p) {
; X32-LABEL: splat_concat3:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat3:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %4
}

define <4 x double> @splat_concat4(double* %p) {
; X32-LABEL: splat_concat4:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat4:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = insertelement <2 x double> undef, double %1, i32 0
  %5 = insertelement <2 x double> %2, double %1, i32 1
  %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %6
}

; PR34041
define <4 x double> @broadcast_shuffle_1000(double* %p) {
; X32-LABEL: broadcast_shuffle_1000:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle_1000:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %3
}

define <4 x double> @broadcast_shuffle1032(double* %p) {
; X32-LABEL: broadcast_shuffle1032:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle1032:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 1
  %3 = insertelement <2 x double> undef, double %1, i32 0
  %4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %4
}

;
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
;
define float @broadcast_lifetime() nounwind {
; X32-LABEL: broadcast_lifetime:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %esi
; X32-NEXT:    subl $40, %esp
; X32-NEXT:    leal {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    calll _gfunc
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## 4-byte Spill
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    calll _gfunc
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 4-byte Folded Reload
; X32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    flds {{[0-9]+}}(%esp)
; X32-NEXT:    addl $40, %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_lifetime:
; X64:       ## %bb.0:
; X64-NEXT:    subq $40, %rsp
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp) ## 4-byte Spill
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT:    addq $40, %rsp
; X64-NEXT:    retq
  %1 = alloca <4 x float>, align 16
  %2 = alloca <4 x float>, align 16
  %3 = bitcast <4 x float>* %1 to i8*
  %4 = bitcast <4 x float>* %2 to i8*

  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
  call void @gfunc(<4 x float>* %1)
  %5 = load <4 x float>, <4 x float>* %1, align 16
  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)

  call void @llvm.lifetime.start.p0i8(i64 16, i8* %4)
  call void @gfunc(<4 x float>* %2)
  %6 = load <4 x float>, <4 x float>* %2, align 16
  call void @llvm.lifetime.end.p0i8(i64 16, i8* %4)

  %7 = extractelement <4 x float> %5, i32 1
  %8 = extractelement <4 x float> %6, i32 1
  %9 = fsub float %8, %7
  ret float %9
}

declare void @gfunc(<4 x float>*)
declare void @llvm.lifetime.start.p0i8(i64, i8*)
declare void @llvm.lifetime.end.p0i8(i64, i8*)