; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64

; Tests for folding a 128-bit vector load + shufflevector that duplicates the
; value into both 128-bit lanes of a 256-bit vector into a single
; vbroadcastf128 from memory. The *_reuse variants below additionally store
; the loaded 128-bit value, and the checks show the codegen then keeps the
; value in an xmm register (vmovaps + vinsertf128) instead of re-reading
; memory with a broadcast.

; Simple case: load <2 x double>, splat both halves -> vbroadcastf128.
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

; Integer element type: with only AVX1 (+avx), the float-domain
; vbroadcastf128 is still used for the <2 x i64> broadcast.
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-LABEL: test_broadcast_2i64_4i64:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-LABEL: test_broadcast_4i32_8i32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

; PR38949 - https://bugs.llvm.org/show_bug.cgi?id=38949
; Don't limit the transform based on extra uses of the load itself (the store is a user of the load's chain value).

; Here only the widened splat is stored (not the 128-bit load result), so the
; vbroadcastf128 fold still fires. Note the `align 1` load: the broadcast
; handles unaligned memory.
define void @subv_reuse_is_ok(<4 x float>* %a, <8 x float>* %b) {
; X32-LABEL: subv_reuse_is_ok:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: subv_reuse_is_ok:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovups %ymm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a, align 1
  %splat128 = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  store <8 x float> %splat128, <8 x float>* %b, align 16
  ret void
}

; *_reuse variants: the 128-bit load result is also stored, so the checks
; expect the value to be kept in xmm1 and widened with vinsertf128 rather
; than broadcast-loaded twice from memory.
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-LABEL: test_broadcast_2f64_4f64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vmovaps %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vmovaps %xmm1, (%rsi)
; X64-NEXT:    retq
  %1 = load <2 x double>, <2 x double>* %p0
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  store <2 x double> %1, <2 x double>* %p1
  ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-LABEL: test_broadcast_2i64_4i64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vmovaps %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vmovaps %xmm1, (%rsi)
; X64-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %p0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  store <2 x i64> %1, <2 x i64>* %p1
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vmovaps %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vmovaps %xmm1, (%rsi)
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %p0
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %1, <4 x float>* %p1
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-LABEL: test_broadcast_4i32_8i32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vmovaps %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vmovaps %xmm1, (%rsi)
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  store <4 x i32> %1, <4 x i32>* %p1
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vmovaps %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vmovaps %xmm1, (%rsi)
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %1, <8 x i16>* %p1
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X32-NEXT:    vmovaps %xmm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT:    vmovaps %xmm1, (%rsi)
; X64-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p0
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i8> %1, <16 x i8>* %p1
  ret <32 x i8> %2
}

; An unrelated store between the load and the shuffle (a chain use only)
; must not block the vbroadcastf128 fold.
define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X32-LABEL: PR29088:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    vmovaps %ymm1, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: PR29088:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %ymm1, (%rsi)
; X64-NEXT:    retq
  %ld = load <4 x i32>, <4 x i32>* %p0
  store <8 x float> zeroinitializer, <8 x float>* %p1
  %shuf = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %shuf
}