; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512

;
; Subvector Load + Broadcast
;
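; A 128-bit (or 256-bit) subvector load feeding a shuffle that repeats the
; subvector should fold into a single vbroadcastf128/vbroadcasti128 (or the
; vbroadcast*32x4/*64x4 forms for 512-bit results on AVX512). On the
; AVX-only targets a 512-bit result is returned in two ymm registers, hence
; the trailing "vmovaps %ymm0, %ymm1" copies in the checks below.
;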

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X86-LABEL: test_broadcast_2f64_4f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %2
}

define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_4i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %2
}

define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X86-LABEL: test_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %2
}

define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_8i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <32 x i16> %2
}

define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16> *%p
  %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %2
}

define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8> *%p
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;
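; Here the loaded subvector is also stored, so the load cannot simply be
; folded into the broadcast: the expected code keeps a plain xmm load/store
; and rebuilds the broadcast from the register with vinsertf128.
;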

define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X86-LABEL: test_broadcast_2f64_4f64_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <2 x double>, <2 x double>* %p0
  store <2 x double> %1, <2 x double>* %p1
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X86-LABEL: test_broadcast_2i64_4i64_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %p0
  store <2 x i64> %1, <2 x i64>* %p1
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X86-LABEL: test_broadcast_4f32_8f32_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %p0
  store <4 x float> %1, <4 x float>* %p1
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X86-LABEL: test_broadcast_4i32_8i32_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x i32> %1, <4 x i32>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X86-LABEL: test_broadcast_8i16_16i16_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p0
  store <8 x i16> %1, <8 x i16>* %p1
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X86-LABEL: test_broadcast_16i8_32i8_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p0
  store <16 x i8> %1, <16 x i8>* %p1
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

;
; Subvector Load + Broadcast with Separate Store
;
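; The store on the chain is of an unrelated zero vector, so the load should
; still fold into the broadcast; only the ordering of the xor and store
; around the broadcast is constrained.
;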

define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

;
; Subvector load with multiple uses + broadcast.
; The fallback to a broadcast should still be done.
;
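; The splatted <1,2,3,4> constant is used at both 256-bit and 512-bit widths,
; so rather than materializing it separately per use the expectation is a
; single ymm constant that is reused (and widened with vinsert*64x4 on
; AVX512).
;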

@ga4 = global <4 x i64> zeroinitializer, align 8
@gb4 = global <8 x i64> zeroinitializer, align 8

define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,2,0]
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0]
; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm7
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X86-AVX1-NEXT:    vandps %ymm6, %ymm1, %ymm1
; X86-AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
; X86-AVX1-NEXT:    vmovdqu %xmm0, ga4+16
; X86-AVX1-NEXT:    vmovdqu %xmm4, ga4
; X86-AVX1-NEXT:    vmovups %ymm2, gb4+32
; X86-AVX1-NEXT:    vmovups %ymm1, gb4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vmovdqu %ymm0, ga4
; X86-AVX2-NEXT:    vmovdqu %ymm2, gb4+32
; X86-AVX2-NEXT:    vmovdqu %ymm1, gb4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,2,0,3,0,4,0]
; X86-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X86-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vmovdqu %ymm0, ga4
; X86-AVX512-NEXT:    vmovdqu64 %zmm1, gb4
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2]
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [3,4]
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [1,2,3,4]
; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm7
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm6, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
; X64-AVX1-NEXT:    vmovdqu %xmm0, ga4+{{.*}}(%rip)
; X64-AVX1-NEXT:    vmovdqu %xmm4, {{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm2, gb4+{{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm2, gb4+{{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm1, {{.*}}(%rip)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
; X64-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovdqu64 %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, <4 x i64>* @ga4, align 8
  store <8 x i64> %2, <8 x i64>* @gb4, align 8
  ret void
}


@ga2 = global <4 x double> zeroinitializer, align 8
@gb2 = global <8 x double> zeroinitializer, align 8

define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT:    vmovupd %ymm0, ga2
; X86-AVX-NEXT:    vmovupd %ymm2, gb2+32
; X86-AVX-NEXT:    vmovupd %ymm1, gb2
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X86-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vmovupd %ymm0, ga2
; X86-AVX512-NEXT:    vmovupd %zmm1, gb2
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm2, gb2+{{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm1, {{.*}}(%rip)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovupd %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, <4 x double>* @ga2, align 8
  store <8 x double> %2, <8 x double>* @gb2, align 8
  ret void
}

;
; Subvector Broadcast from register
;
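; With the source subvector already in a register there is no load to fold,
; so the broadcast is built with vinsertf128 (and vinsertf64x4 for zmm
; results); the "# kill" comments mark the implicit widening of the input
; xmm/ymm operand to the wider register class.
;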

define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
; X86-LABEL: reg_broadcast_2f64_4f64:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %1
}

define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_2f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %1
}

define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %1
}

define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
; X86-LABEL: reg_broadcast_2i64_4i64:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2i64_4i64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %1
}

define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_2i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %1
}

define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %1
}

define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
; X86-LABEL: reg_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %1
}

define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %1
}

define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %1
}

define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
; X86-LABEL: reg_broadcast_4i32_8i32:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %1
}

define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %1
}

define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %1
}

define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
; X86-LABEL: reg_broadcast_8i16_16i16:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %1
}

define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <32 x i16> %1
}

define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i16> %1
}

define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
; X86-LABEL: reg_broadcast_16i8_32i8:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %1
}

define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %1
}

define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %1
}

;
; PR34394
;
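; PR34394: a repeated <2 x i32> subvector should broadcast as a single
; 64-bit element (vmovddup / vbroadcastsd of the pair) instead of being
; scalarized.
;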

define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_4xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_4xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %res
}

define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_8xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_8xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i32> %res
}

define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd (%eax), %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %zmm0
; X64-AVX512-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i32> %res
}

;
; PR34041
;
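; PR34041: broadcasts should still be recognized when some lanes are undef
; or blended away; the 'u' positions in the test names appear to mark undef
; lanes and 'E' a lane taken from the select default.
;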

define <4 x double> @broadcast_v4f64_f64_u000(double* %p) {
; X86-LABEL: broadcast_v4f64_f64_u000:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_f64_u000:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %s = load double, double* %p
  %vec = insertelement <2 x double> undef, double %s, i32 0
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) {
; X86-LABEL: broadcast_v4f64_v2f64_4u61:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_v2f64_4u61:
; X64:       # %bb.0:
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X64-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
  ret <4 x double> %res
}

define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm1
; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X64-NEXT:    retq
  %vec = load <2 x float>, <2 x float>* %vp
  %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
  ret <8 x float> %res
}
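
; The remaining two tests splat a <2 x double> load to <8 x double>. With
; plain AVX the CHECK lines expect a vbroadcastf128 plus a ymm register copy
; for the upper half; AVX512 folds the whole splat into a single
; vbroadcastf32x4 to zmm.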

define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}

define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}