; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512F
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ

;
; Subvector Load + Broadcast
;

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_8f64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %2
}

define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f64_8f64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x double>, <4 x double> *%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_4i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_8i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %2
}

define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i64_8i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i64>, <4 x i64> *%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_16f32:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %2
}

define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8f32_16f32:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_8i32:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_16i32:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i32_16i32:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_32i16:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <32 x i16> %2
}

define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i16_32i16:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <16 x i16>, <16 x i16> *%p
  %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_64i8:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %2
}

define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_32i8_64i8:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <32 x i8>, <32 x i8> *%p
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;

define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-LABEL: test_broadcast_2f64_4f64_reuse:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = load <2 x double>, <2 x double>* %p0
  store <2 x double> %1, <2 x double>* %p1
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-LABEL: test_broadcast_2i64_4i64_reuse:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = load <2 x i64>, <2 x i64>* %p0
  store <2 x i64> %1, <2 x i64>* %p1
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %p0
  store <4 x float> %1, <4 x float>* %p1
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-LABEL: test_broadcast_4i32_8i32_reuse:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x i32> %1, <4 x i32>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16_reuse:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = load <8 x i16>, <8 x i16> *%p0
  store <8 x i16> %1, <8 x i16>* %p1
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8_reuse:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = load <16 x i8>, <16 x i8> *%p0
  store <16 x i8> %1, <16 x i8>* %p1
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

;
; Subvector Load + Broadcast with Separate Store
;

define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

;
; subvector Load with multiple uses + broadcast
; Fallback to the broadcast should be done
;

@ga4 = global <4 x i64> zeroinitializer, align 8
@gb4 = global <8 x i64> zeroinitializer, align 8

define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX1: # %bb.0: # %entry
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,0,4,0]
; X32-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,0,2,0]
; X32-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6
; X32-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
; X32-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
; X32-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; X32-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X32-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; X32-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; X32-AVX1-NEXT: vmovups %ymm0, ga4
; X32-AVX1-NEXT: vmovups %ymm2, gb4+32
; X32-AVX1-NEXT: vmovups %ymm1, gb4
; X32-AVX1-NEXT: vzeroupper
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX2: # %bb.0: # %entry
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vmovdqu %ymm0, ga4
; X32-AVX2-NEXT: vmovdqu %ymm2, gb4+32
; X32-AVX2-NEXT: vmovdqu %ymm1, gb4
; X32-AVX2-NEXT: vzeroupper
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX512: # %bb.0: # %entry
; X32-AVX512-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X32-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovdqu %ymm0, ga4
; X32-AVX512-NEXT: vmovdqu64 %zmm1, gb4
; X32-AVX512-NEXT: vzeroupper
; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4]
; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2]
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
;
X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 904; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 905; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip) 906; X64-AVX1-NEXT: vmovups %ymm2, gb4+{{.*}}(%rip) 907; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip) 908; X64-AVX1-NEXT: vzeroupper 909; X64-AVX1-NEXT: retq 910; 911; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: 912; X64-AVX2: # %bb.0: # %entry 913; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] 914; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 915; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 916; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 917; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 918; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 919; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip) 920; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+{{.*}}(%rip) 921; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip) 922; X64-AVX2-NEXT: vzeroupper 923; X64-AVX2-NEXT: retq 924; 925; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: 926; X64-AVX512: # %bb.0: # %entry 927; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4] 928; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 929; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 930; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 931; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 932; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip) 933; X64-AVX512-NEXT: vmovdqu64 %zmm1, {{.*}}(%rip) 934; X64-AVX512-NEXT: vzeroupper 935; X64-AVX512-NEXT: retq 936entry: 937 %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4> 938 %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4> 939 %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4> 940 store <4 x i64> %0, <4 x i64>* @ga4, align 8 941 store <8 x i64> %2, <8 x i64>* @gb4, align 8 942 ret void 943} 944 945 946@ga2 = global <4 x double> zeroinitializer, align 8 947@gb2 = global <8 x double> zeroinitializer, align 8 948 949define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) { 950; X32-AVX-LABEL: 
fallback_broadcast_v4f64_to_v8f64: 951; X32-AVX: # %bb.0: # %entry 952; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] 953; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 954; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 955; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 956; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 957; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 958; X32-AVX-NEXT: vmovupd %ymm0, ga2 959; X32-AVX-NEXT: vmovupd %ymm2, gb2+32 960; X32-AVX-NEXT: vmovupd %ymm1, gb2 961; X32-AVX-NEXT: vzeroupper 962; X32-AVX-NEXT: retl 963; 964; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: 965; X32-AVX512: # %bb.0: # %entry 966; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] 967; X32-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 968; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2 969; X32-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 970; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 971; X32-AVX512-NEXT: vmovupd %ymm0, ga2 972; X32-AVX512-NEXT: vmovupd %zmm1, gb2 973; X32-AVX512-NEXT: vzeroupper 974; X32-AVX512-NEXT: retl 975; 976; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: 977; X64-AVX: # %bb.0: # %entry 978; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] 979; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 980; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 981; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 982; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 983; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 984; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip) 985; X64-AVX-NEXT: vmovupd %ymm2, gb2+{{.*}}(%rip) 986; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip) 987; X64-AVX-NEXT: vzeroupper 988; X64-AVX-NEXT: retq 989; 990; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: 991; X64-AVX512: # %bb.0: # %entry 992; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] 993; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 994; X64-AVX512-NEXT: 
vinsertf64x4 $1, %ymm2, %zmm2, %zmm2 995; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 996; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 997; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip) 998; X64-AVX512-NEXT: vmovupd %zmm1, {{.*}}(%rip) 999; X64-AVX512-NEXT: vzeroupper 1000; X64-AVX512-NEXT: retq 1001entry: 1002 %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0> 1003 %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0> 1004 %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0> 1005 store <4 x double> %0, <4 x double>* @ga2, align 8 1006 store <8 x double> %2, <8 x double>* @gb2, align 8 1007 ret void 1008} 1009 1010; 1011; Subvector Broadcast from register 1012; 1013 1014define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind { 1015; X32-LABEL: reg_broadcast_2f64_4f64: 1016; X32: # %bb.0: 1017; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1018; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1019; X32-NEXT: retl 1020; 1021; X64-LABEL: reg_broadcast_2f64_4f64: 1022; X64: # %bb.0: 1023; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1024; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1025; X64-NEXT: retq 1026 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1027 ret <4 x double> %1 1028} 1029 1030define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind { 1031; X32-AVX-LABEL: reg_broadcast_2f64_8f64: 1032; X32-AVX: # %bb.0: 1033; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1034; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1035; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1036; X32-AVX-NEXT: retl 1037; 1038; X32-AVX512-LABEL: reg_broadcast_2f64_8f64: 1039; X32-AVX512: # %bb.0: 1040; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1041; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1042; 
X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1043; X32-AVX512-NEXT: retl 1044; 1045; X64-AVX-LABEL: reg_broadcast_2f64_8f64: 1046; X64-AVX: # %bb.0: 1047; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1048; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1049; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1050; X64-AVX-NEXT: retq 1051; 1052; X64-AVX512-LABEL: reg_broadcast_2f64_8f64: 1053; X64-AVX512: # %bb.0: 1054; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1055; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1056; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1057; X64-AVX512-NEXT: retq 1058 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> 1059 ret <8 x double> %1 1060} 1061 1062define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind { 1063; X32-AVX-LABEL: reg_broadcast_4f64_8f64: 1064; X32-AVX: # %bb.0: 1065; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1066; X32-AVX-NEXT: retl 1067; 1068; X32-AVX512-LABEL: reg_broadcast_4f64_8f64: 1069; X32-AVX512: # %bb.0: 1070; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1071; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1072; X32-AVX512-NEXT: retl 1073; 1074; X64-AVX-LABEL: reg_broadcast_4f64_8f64: 1075; X64-AVX: # %bb.0: 1076; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1077; X64-AVX-NEXT: retq 1078; 1079; X64-AVX512-LABEL: reg_broadcast_4f64_8f64: 1080; X64-AVX512: # %bb.0: 1081; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1082; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1083; X64-AVX512-NEXT: retq 1084 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1085 ret <8 x double> %1 1086} 1087 1088define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind { 1089; X32-LABEL: reg_broadcast_2i64_4i64: 1090; X32: # %bb.0: 1091; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1092; X32-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 1093; X32-NEXT: retl 1094; 1095; X64-LABEL: reg_broadcast_2i64_4i64: 1096; X64: # %bb.0: 1097; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1098; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1099; X64-NEXT: retq 1100 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1101 ret <4 x i64> %1 1102} 1103 1104define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind { 1105; X32-AVX-LABEL: reg_broadcast_2i64_8i64: 1106; X32-AVX: # %bb.0: 1107; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1108; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1109; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1110; X32-AVX-NEXT: retl 1111; 1112; X32-AVX512-LABEL: reg_broadcast_2i64_8i64: 1113; X32-AVX512: # %bb.0: 1114; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1115; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1116; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1117; X32-AVX512-NEXT: retl 1118; 1119; X64-AVX-LABEL: reg_broadcast_2i64_8i64: 1120; X64-AVX: # %bb.0: 1121; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1122; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1123; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1124; X64-AVX-NEXT: retq 1125; 1126; X64-AVX512-LABEL: reg_broadcast_2i64_8i64: 1127; X64-AVX512: # %bb.0: 1128; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1129; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1130; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1131; X64-AVX512-NEXT: retq 1132 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> 1133 ret <8 x i64> %1 1134} 1135 1136define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind { 1137; X32-AVX-LABEL: reg_broadcast_4i64_8i64: 1138; X32-AVX: # %bb.0: 1139; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1140; X32-AVX-NEXT: retl 1141; 1142; X32-AVX512-LABEL: reg_broadcast_4i64_8i64: 1143; X32-AVX512: # %bb.0: 
1144; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1145; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1146; X32-AVX512-NEXT: retl 1147; 1148; X64-AVX-LABEL: reg_broadcast_4i64_8i64: 1149; X64-AVX: # %bb.0: 1150; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1151; X64-AVX-NEXT: retq 1152; 1153; X64-AVX512-LABEL: reg_broadcast_4i64_8i64: 1154; X64-AVX512: # %bb.0: 1155; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1156; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1157; X64-AVX512-NEXT: retq 1158 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1159 ret <8 x i64> %1 1160} 1161 1162define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind { 1163; X32-LABEL: reg_broadcast_4f32_8f32: 1164; X32: # %bb.0: 1165; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1166; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1167; X32-NEXT: retl 1168; 1169; X64-LABEL: reg_broadcast_4f32_8f32: 1170; X64: # %bb.0: 1171; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1172; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1173; X64-NEXT: retq 1174 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1175 ret <8 x float> %1 1176} 1177 1178define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind { 1179; X32-AVX-LABEL: reg_broadcast_4f32_16f32: 1180; X32-AVX: # %bb.0: 1181; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1182; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1183; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1184; X32-AVX-NEXT: retl 1185; 1186; X32-AVX512-LABEL: reg_broadcast_4f32_16f32: 1187; X32-AVX512: # %bb.0: 1188; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1189; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1190; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1191; X32-AVX512-NEXT: retl 1192; 1193; X64-AVX-LABEL: reg_broadcast_4f32_16f32: 1194; 
X64-AVX: # %bb.0: 1195; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1196; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1197; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1198; X64-AVX-NEXT: retq 1199; 1200; X64-AVX512-LABEL: reg_broadcast_4f32_16f32: 1201; X64-AVX512: # %bb.0: 1202; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1203; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1204; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1205; X64-AVX512-NEXT: retq 1206 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1207 ret <16 x float> %1 1208} 1209 1210define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind { 1211; X32-AVX-LABEL: reg_broadcast_8f32_16f32: 1212; X32-AVX: # %bb.0: 1213; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1214; X32-AVX-NEXT: retl 1215; 1216; X32-AVX512-LABEL: reg_broadcast_8f32_16f32: 1217; X32-AVX512: # %bb.0: 1218; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1219; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1220; X32-AVX512-NEXT: retl 1221; 1222; X64-AVX-LABEL: reg_broadcast_8f32_16f32: 1223; X64-AVX: # %bb.0: 1224; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1225; X64-AVX-NEXT: retq 1226; 1227; X64-AVX512-LABEL: reg_broadcast_8f32_16f32: 1228; X64-AVX512: # %bb.0: 1229; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1230; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1231; X64-AVX512-NEXT: retq 1232 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1233 ret <16 x float> %1 1234} 1235 1236define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind { 1237; X32-LABEL: reg_broadcast_4i32_8i32: 1238; X32: # %bb.0: 1239; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1240; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, 
%ymm0 1241; X32-NEXT: retl 1242; 1243; X64-LABEL: reg_broadcast_4i32_8i32: 1244; X64: # %bb.0: 1245; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1246; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1247; X64-NEXT: retq 1248 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1249 ret <8 x i32> %1 1250} 1251 1252define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind { 1253; X32-AVX-LABEL: reg_broadcast_4i32_16i32: 1254; X32-AVX: # %bb.0: 1255; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1256; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1257; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1258; X32-AVX-NEXT: retl 1259; 1260; X32-AVX512-LABEL: reg_broadcast_4i32_16i32: 1261; X32-AVX512: # %bb.0: 1262; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1263; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1264; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1265; X32-AVX512-NEXT: retl 1266; 1267; X64-AVX-LABEL: reg_broadcast_4i32_16i32: 1268; X64-AVX: # %bb.0: 1269; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1270; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1271; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1272; X64-AVX-NEXT: retq 1273; 1274; X64-AVX512-LABEL: reg_broadcast_4i32_16i32: 1275; X64-AVX512: # %bb.0: 1276; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1277; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1278; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1279; X64-AVX512-NEXT: retq 1280 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1281 ret <16 x i32> %1 1282} 1283 1284define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind { 1285; X32-AVX-LABEL: reg_broadcast_8i32_16i32: 1286; X32-AVX: # %bb.0: 1287; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1288; X32-AVX-NEXT: retl 1289; 1290; 
X32-AVX512-LABEL: reg_broadcast_8i32_16i32: 1291; X32-AVX512: # %bb.0: 1292; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1293; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1294; X32-AVX512-NEXT: retl 1295; 1296; X64-AVX-LABEL: reg_broadcast_8i32_16i32: 1297; X64-AVX: # %bb.0: 1298; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1299; X64-AVX-NEXT: retq 1300; 1301; X64-AVX512-LABEL: reg_broadcast_8i32_16i32: 1302; X64-AVX512: # %bb.0: 1303; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1304; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1305; X64-AVX512-NEXT: retq 1306 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1307 ret <16 x i32> %1 1308} 1309 1310define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind { 1311; X32-LABEL: reg_broadcast_8i16_16i16: 1312; X32: # %bb.0: 1313; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1314; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1315; X32-NEXT: retl 1316; 1317; X64-LABEL: reg_broadcast_8i16_16i16: 1318; X64: # %bb.0: 1319; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1320; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1321; X64-NEXT: retq 1322 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1323 ret <16 x i16> %1 1324} 1325 1326define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind { 1327; X32-AVX-LABEL: reg_broadcast_8i16_32i16: 1328; X32-AVX: # %bb.0: 1329; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1330; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1331; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1332; X32-AVX-NEXT: retl 1333; 1334; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16: 1335; X32-AVX512F: # %bb.0: 1336; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1337; X32-AVX512F-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 1338; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1339; X32-AVX512F-NEXT: retl 1340; 1341; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16: 1342; X32-AVX512BW: # %bb.0: 1343; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1344; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1345; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1346; X32-AVX512BW-NEXT: retl 1347; 1348; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16: 1349; X32-AVX512DQ: # %bb.0: 1350; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1351; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1352; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1353; X32-AVX512DQ-NEXT: retl 1354; 1355; X64-AVX-LABEL: reg_broadcast_8i16_32i16: 1356; X64-AVX: # %bb.0: 1357; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1358; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1359; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1360; X64-AVX-NEXT: retq 1361; 1362; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16: 1363; X64-AVX512F: # %bb.0: 1364; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1365; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1366; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1367; X64-AVX512F-NEXT: retq 1368; 1369; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16: 1370; X64-AVX512BW: # %bb.0: 1371; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1372; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1373; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1374; X64-AVX512BW-NEXT: retq 1375; 1376; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16: 1377; X64-AVX512DQ: # %bb.0: 1378; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1379; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1380; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1381; X64-AVX512DQ-NEXT: retq 1382 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, 
i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1383 ret <32 x i16> %1 1384} 1385 1386define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind { 1387; X32-AVX-LABEL: reg_broadcast_16i16_32i16: 1388; X32-AVX: # %bb.0: 1389; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1390; X32-AVX-NEXT: retl 1391; 1392; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16: 1393; X32-AVX512F: # %bb.0: 1394; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1395; X32-AVX512F-NEXT: retl 1396; 1397; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16: 1398; X32-AVX512BW: # %bb.0: 1399; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1400; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1401; X32-AVX512BW-NEXT: retl 1402; 1403; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16: 1404; X32-AVX512DQ: # %bb.0: 1405; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1406; X32-AVX512DQ-NEXT: retl 1407; 1408; X64-AVX-LABEL: reg_broadcast_16i16_32i16: 1409; X64-AVX: # %bb.0: 1410; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1411; X64-AVX-NEXT: retq 1412; 1413; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16: 1414; X64-AVX512F: # %bb.0: 1415; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1416; X64-AVX512F-NEXT: retq 1417; 1418; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16: 1419; X64-AVX512BW: # %bb.0: 1420; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1421; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1422; X64-AVX512BW-NEXT: retq 1423; 1424; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16: 1425; X64-AVX512DQ: # %bb.0: 1426; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1427; X64-AVX512DQ-NEXT: retq 1428 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1429 ret <32 x i16> 
%1 1430} 1431 1432define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind { 1433; X32-LABEL: reg_broadcast_16i8_32i8: 1434; X32: # %bb.0: 1435; X32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1436; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1437; X32-NEXT: retl 1438; 1439; X64-LABEL: reg_broadcast_16i8_32i8: 1440; X64: # %bb.0: 1441; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1442; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1443; X64-NEXT: retq 1444 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1445 ret <32 x i8> %1 1446} 1447 1448define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind { 1449; X32-AVX-LABEL: reg_broadcast_16i8_64i8: 1450; X32-AVX: # %bb.0: 1451; X32-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1452; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1453; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1454; X32-AVX-NEXT: retl 1455; 1456; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8: 1457; X32-AVX512F: # %bb.0: 1458; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1459; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1460; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1461; X32-AVX512F-NEXT: retl 1462; 1463; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8: 1464; X32-AVX512BW: # %bb.0: 1465; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1466; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1467; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1468; X32-AVX512BW-NEXT: retl 1469; 1470; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8: 1471; X32-AVX512DQ: # %bb.0: 1472; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1473; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1474; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1475; 
X32-AVX512DQ-NEXT: retl 1476; 1477; X64-AVX-LABEL: reg_broadcast_16i8_64i8: 1478; X64-AVX: # %bb.0: 1479; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1480; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1481; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1482; X64-AVX-NEXT: retq 1483; 1484; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8: 1485; X64-AVX512F: # %bb.0: 1486; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1487; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1488; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1489; X64-AVX512F-NEXT: retq 1490; 1491; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8: 1492; X64-AVX512BW: # %bb.0: 1493; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1494; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1495; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1496; X64-AVX512BW-NEXT: retq 1497; 1498; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8: 1499; X64-AVX512DQ: # %bb.0: 1500; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1501; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1502; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1503; X64-AVX512DQ-NEXT: retq 1504 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1505 ret <64 x i8> %1 1506} 1507 1508define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind { 1509; X32-AVX-LABEL: reg_broadcast_32i8_64i8: 1510; X32-AVX: # %bb.0: 1511; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 1512; X32-AVX-NEXT: retl 1513; 1514; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8: 
1515; X32-AVX512F: # %bb.0: 1516; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1517; X32-AVX512F-NEXT: retl 1518; 1519; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8: 1520; X32-AVX512BW: # %bb.0: 1521; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1522; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1523; X32-AVX512BW-NEXT: retl 1524; 1525; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8: 1526; X32-AVX512DQ: # %bb.0: 1527; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1528; X32-AVX512DQ-NEXT: retl 1529; 1530; X64-AVX-LABEL: reg_broadcast_32i8_64i8: 1531; X64-AVX: # %bb.0: 1532; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1533; X64-AVX-NEXT: retq 1534; 1535; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8: 1536; X64-AVX512F: # %bb.0: 1537; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 1538; X64-AVX512F-NEXT: retq 1539; 1540; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8: 1541; X64-AVX512BW: # %bb.0: 1542; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1543; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1544; X64-AVX512BW-NEXT: retq 1545; 1546; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8: 1547; X64-AVX512DQ: # %bb.0: 1548; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 1549; X64-AVX512DQ-NEXT: retq 1550 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1551 ret <64 x i8> %1 1552} 1553