1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86 3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64 4 5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c 6 7define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 8; CHECK-LABEL: test_mm256_add_pd: 9; CHECK: # %bb.0: 10; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 11; CHECK-NEXT: ret{{[l|q]}} 12 %res = fadd <4 x double> %a0, %a1 13 ret <4 x double> %res 14} 15 16define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 17; CHECK-LABEL: test_mm256_add_ps: 18; CHECK: # %bb.0: 19; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 20; CHECK-NEXT: ret{{[l|q]}} 21 %res = fadd <8 x float> %a0, %a1 22 ret <8 x float> %res 23} 24 25define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 26; CHECK-LABEL: test_mm256_addsub_pd: 27; CHECK: # %bb.0: 28; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 29; CHECK-NEXT: ret{{[l|q]}} 30 %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) 31 ret <4 x double> %res 32} 33declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone 34 35define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 36; CHECK-LABEL: test_mm256_addsub_ps: 37; CHECK: # %bb.0: 38; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 39; CHECK-NEXT: ret{{[l|q]}} 40 %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) 41 ret <8 x float> %res 42} 43declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone 44 45define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 46; CHECK-LABEL: test_mm256_and_pd: 47; CHECK: # %bb.0: 48; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 49; CHECK-NEXT: ret{{[l|q]}} 50 %1 = bitcast <4 x double> %a0 to <4 x i64> 51 %2 = bitcast <4 x double> %a1 to <4 x i64> 52 %res = and <4 x i64> %1, %2 53 %bc = bitcast <4 x i64> %res to <4 x double> 54 ret <4 x double> %bc 55} 56 57define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 58; CHECK-LABEL: test_mm256_and_ps: 59; CHECK: # %bb.0: 60; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 61; CHECK-NEXT: ret{{[l|q]}} 62 %1 = bitcast <8 x float> %a0 to <8 x i32> 63 %2 = bitcast <8 x float> %a1 to <8 x i32> 64 %res = and <8 x i32> %1, %2 65 %bc = bitcast <8 x i32> %res to <8 x float> 66 ret <8 x float> %bc 67} 68 69define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 70; CHECK-LABEL: test_mm256_andnot_pd: 71; CHECK: # %bb.0: 72; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 73; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 74; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 75; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 76; CHECK-NEXT: ret{{[l|q]}} 77 %1 = bitcast <4 x double> %a0 to <4 x i64> 78 %2 = bitcast <4 x double> %a1 to <4 x i64> 79 %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1> 80 %res = and <4 x i64> %3, %2 81 %bc = bitcast <4 x i64> %res to <4 x double> 82 ret <4 x double> %bc 83} 84 85define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 86; CHECK-LABEL: test_mm256_andnot_ps: 87; CHECK: # %bb.0: 88; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 89; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 90; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 91; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 92; CHECK-NEXT: ret{{[l|q]}} 93 %1 = bitcast <8 x float> %a0 to <8 x i32> 94 %2 = bitcast <8 x float> %a1 to <8 x i32> 95 %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> 96 %res = and <8 x i32> %3, %2 97 %bc = bitcast <8 x i32> %res to <8 x float> 98 ret <8 x float> %bc 99} 100 101define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 102; CHECK-LABEL: test_mm256_blend_pd: 103; CHECK: # %bb.0: 104; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] 105; CHECK-NEXT: ret{{[l|q]}} 106 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> 107 ret <4 x double> %res 108} 109 110define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 111; CHECK-LABEL: test_mm256_blend_ps: 112; CHECK: # %bb.0: 113; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7] 114; CHECK-NEXT: ret{{[l|q]}} 115 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15> 116 ret <8 x float> %res 117} 118 119define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind { 120; CHECK-LABEL: test_mm256_blendv_pd: 121; CHECK: # %bb.0: 122; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 123; CHECK-NEXT: ret{{[l|q]}} 124 %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) 125 ret <4 x double> %res 126} 127declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone 128 129define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind { 130; CHECK-LABEL: test_mm256_blendv_ps: 131; CHECK: # %bb.0: 132; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 133; CHECK-NEXT: ret{{[l|q]}} 134 %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) 135 ret <8 x float> %res 136} 137declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone 138 139define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind { 140; X86-LABEL: test_mm256_broadcast_pd: 141; X86: # %bb.0: 142; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 143; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 144; X86-NEXT: retl 145; 146; X64-LABEL: test_mm256_broadcast_pd: 147; X64: # %bb.0: 148; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 149; X64-NEXT: retq 150 %ld = load <2 x double>, <2 x double>* %a0 151 %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 152 ret <4 x double> %res 153} 154 155define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind { 156; X86-LABEL: test_mm256_broadcast_ps: 157; X86: # %bb.0: 158; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 159; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 160; X86-NEXT: retl 161; 162; X64-LABEL: test_mm256_broadcast_ps: 163; X64: # %bb.0: 164; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 165; X64-NEXT: retq 166 %ld = load <4 x float>, <4 x float>* %a0 167 %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 168 ret <8 x float> %res 169} 170 171define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind { 172; X86-LABEL: test_mm256_broadcast_sd: 173; X86: # %bb.0: 174; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 175; X86-NEXT: vbroadcastsd (%eax), %ymm0 176; X86-NEXT: retl 177; 178; X64-LABEL: test_mm256_broadcast_sd: 179; X64: # %bb.0: 180; X64-NEXT: vbroadcastsd (%rdi), %ymm0 181; X64-NEXT: retq 182 %ld = load double, double* %a0 183 %ins0 = insertelement <4 x double> undef, double %ld, i32 0 184 %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1 185 %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2 186 %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3 187 ret <4 x double> %ins3 188} 189 190define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind { 191; X86-LABEL: test_mm_broadcast_ss: 192; X86: # %bb.0: 193; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 194; X86-NEXT: vbroadcastss (%eax), %xmm0 195; X86-NEXT: retl 196; 197; X64-LABEL: test_mm_broadcast_ss: 198; X64: # %bb.0: 199; X64-NEXT: vbroadcastss (%rdi), %xmm0 200; X64-NEXT: retq 201 %ld = load float, float* %a0 202 %ins0 = insertelement <4 x float> undef, float %ld, i32 0 203 %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1 204 %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2 205 %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3 206 ret <4 x float> %ins3 207} 208 209define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind { 210; X86-LABEL: test_mm256_broadcast_ss: 211; X86: # %bb.0: 212; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 213; X86-NEXT: vbroadcastss (%eax), %ymm0 214; X86-NEXT: retl 215; 216; X64-LABEL: test_mm256_broadcast_ss: 217; X64: # %bb.0: 218; X64-NEXT: vbroadcastss (%rdi), %ymm0 219; X64-NEXT: retq 220 %ld = load float, float* %a0 221 %ins0 = insertelement <8 x float> undef, float %ld, i32 0 222 %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1 223 %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2 224 %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3 225 %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4 226 %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5 227 %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6 228 %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7 229 ret <8 x float> %ins7 230} 231 232define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind { 233; CHECK-LABEL: test_mm256_castpd_ps: 234; CHECK: # %bb.0: 235; CHECK-NEXT: ret{{[l|q]}} 236 %res = bitcast <4 x double> %a0 to <8 x float> 237 ret <8 x float> %res 238} 239 240define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind { 241; CHECK-LABEL: test_mm256_castpd_si256: 242; CHECK: # %bb.0: 243; CHECK-NEXT: ret{{[l|q]}} 244 %res = bitcast <4 x double> %a0 to <4 x i64> 245 ret <4 x i64> %res 246} 247 248define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind { 249; CHECK-LABEL: test_mm256_castpd128_pd256: 250; CHECK: # %bb.0: 251; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 252; CHECK-NEXT: ret{{[l|q]}} 253 %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 254 ret <4 x double> %res 255} 256 257define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind { 258; CHECK-LABEL: test_mm256_castpd256_pd128: 259; CHECK: # %bb.0: 260; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 261; CHECK-NEXT: vzeroupper 262; CHECK-NEXT: ret{{[l|q]}} 263 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1> 264 ret <2 x double> %res 265} 266 267define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind { 268; CHECK-LABEL: test_mm256_castps_pd: 269; CHECK: # %bb.0: 270; CHECK-NEXT: ret{{[l|q]}} 271 %res = bitcast <8 x float> %a0 to <4 x double> 272 ret <4 x double> %res 273} 274 275define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind { 276; CHECK-LABEL: test_mm256_castps_si256: 277; CHECK: # %bb.0: 278; CHECK-NEXT: ret{{[l|q]}} 279 %res = bitcast <8 x float> %a0 to <4 x i64> 280 ret <4 x i64> %res 281} 282 283define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind { 284; CHECK-LABEL: test_mm256_castps128_ps256: 285; CHECK: # %bb.0: 286; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 287; CHECK-NEXT: ret{{[l|q]}} 288 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 289 ret <8 x float> %res 290} 291 292define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind { 293; CHECK-LABEL: test_mm256_castps256_ps128: 294; CHECK: # %bb.0: 295; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 296; CHECK-NEXT: vzeroupper 297; CHECK-NEXT: ret{{[l|q]}} 298 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 299 ret <4 x float> %res 300} 301 302define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind { 303; CHECK-LABEL: test_mm256_castsi128_si256: 304; CHECK: # %bb.0: 305; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 306; CHECK-NEXT: ret{{[l|q]}} 307 %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 308 ret <4 x i64> %res 309} 310 311define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind { 312; CHECK-LABEL: test_mm256_castsi256_pd: 313; CHECK: # %bb.0: 314; CHECK-NEXT: ret{{[l|q]}} 315 %res = bitcast <4 x i64> %a0 to <4 x double> 316 ret <4 x double> %res 317} 318 319define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind { 320; CHECK-LABEL: test_mm256_castsi256_ps: 321; CHECK: # %bb.0: 322; CHECK-NEXT: ret{{[l|q]}} 323 %res = bitcast <4 x i64> %a0 to <8 x float> 324 ret <8 x float> %res 325} 326 327define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind { 328; CHECK-LABEL: test_mm256_castsi256_si128: 329; CHECK: # %bb.0: 330; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 331; CHECK-NEXT: vzeroupper 332; CHECK-NEXT: ret{{[l|q]}} 333 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1> 334 ret <2 x i64> %res 335} 336 337define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind { 338; CHECK-LABEL: test_mm256_ceil_pd: 339; CHECK: # %bb.0: 340; CHECK-NEXT: vroundpd $2, %ymm0, %ymm0 341; CHECK-NEXT: ret{{[l|q]}} 342 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2) 343 ret <4 x double> %res 344} 345declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone 346 347define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind { 348; CHECK-LABEL: test_mm256_ceil_ps: 349; CHECK: # %bb.0: 350; CHECK-NEXT: vroundps $2, %ymm0, %ymm0 351; CHECK-NEXT: ret{{[l|q]}} 352 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2) 353 ret <8 x float> %res 354} 355declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone 356 357define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind { 358; CHECK-LABEL: test_mm_cmp_pd: 359; CHECK: # %bb.0: 360; CHECK-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0 361; CHECK-NEXT: ret{{[l|q]}} 362 %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13) 363 ret <2 x double> %res 364} 365declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone 366 367define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 368; CHECK-LABEL: test_mm256_cmp_pd: 369; CHECK: # %bb.0: 370; CHECK-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0 371; CHECK-NEXT: ret{{[l|q]}} 372 %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13) 373 ret <4 x double> %res 374} 375declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone 376 377define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 378; CHECK-LABEL: test_mm_cmp_ps: 379; CHECK: # %bb.0: 380; CHECK-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0 381; CHECK-NEXT: ret{{[l|q]}} 382 %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13) 383 ret <4 x float> %res 384} 385declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone 386 387define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 388; CHECK-LABEL: test_mm256_cmp_ps: 389; CHECK: # %bb.0: 390; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0 391; CHECK-NEXT: ret{{[l|q]}} 392 %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13) 393 ret <8 x float> %res 394} 395declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 396 397define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind { 398; CHECK-LABEL: test_mm_cmp_sd: 399; CHECK: # %bb.0: 400; CHECK-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0 401; CHECK-NEXT: ret{{[l|q]}} 402 %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13) 403 ret <2 x double> %res 404} 405declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone 406 407define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind { 408; CHECK-LABEL: test_mm_cmp_ss: 409; CHECK: # %bb.0: 410; CHECK-NEXT: vcmpgess %xmm1, %xmm0, %xmm0 411; CHECK-NEXT: ret{{[l|q]}} 412 %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13) 413 ret <4 x float> %res 414} 415declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone 416 417define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind { 418; CHECK-LABEL: test_mm256_cvtepi32_pd: 419; CHECK: # %bb.0: 420; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 421; CHECK-NEXT: ret{{[l|q]}} 422 %arg0 = bitcast <2 x i64> %a0 to <4 x i32> 423 %res = sitofp <4 x i32> %arg0 to <4 x double> 424 ret <4 x double> %res 425} 426 427define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind { 428; CHECK-LABEL: test_mm256_cvtepi32_ps: 429; CHECK: # %bb.0: 430; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 431; CHECK-NEXT: ret{{[l|q]}} 432 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 433 %res = sitofp <8 x i32> %arg0 to <8 x float> 434 ret <8 x float> %res 435} 436 437define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind { 438; CHECK-LABEL: test_mm256_cvtpd_epi32: 439; CHECK: # %bb.0: 440; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 441; CHECK-NEXT: vzeroupper 442; CHECK-NEXT: ret{{[l|q]}} 443 %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) 444 %res = bitcast <4 x i32> %cvt to <2 x i64> 445 ret <2 x i64> %res 446} 447declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone 448 449define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind { 450; CHECK-LABEL: test_mm256_cvtpd_ps: 451; CHECK: # %bb.0: 452; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 453; CHECK-NEXT: vzeroupper 454; CHECK-NEXT: ret{{[l|q]}} 455 %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) 456 ret <4 x float> %res 457} 458declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone 459 460define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind { 461; CHECK-LABEL: test_mm256_cvtps_epi32: 462; CHECK: # %bb.0: 463; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 464; CHECK-NEXT: ret{{[l|q]}} 465 %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) 466 %res = bitcast <8 x i32> %cvt to <4 x i64> 467 ret <4 x i64> %res 468} 469declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone 470 471define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind { 472; CHECK-LABEL: test_mm256_cvtps_pd: 473; CHECK: # %bb.0: 474; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 475; CHECK-NEXT: ret{{[l|q]}} 476 %res = fpext <4 x float> %a0 to <4 x double> 477 ret <4 x double> %res 478} 479 480define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind { 481; CHECK-LABEL: test_mm256_cvttpd_epi32: 482; CHECK: # %bb.0: 483; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 484; CHECK-NEXT: vzeroupper 485; CHECK-NEXT: ret{{[l|q]}} 486 %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) 487 %res = bitcast <4 x i32> %cvt to <2 x i64> 488 ret <2 x i64> %res 489} 490declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone 491 492define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { 493; CHECK-LABEL: test_mm256_cvttps_epi32: 494; CHECK: # %bb.0: 495; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 496; CHECK-NEXT: ret{{[l|q]}} 497 %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) 498 %res = bitcast <8 x i32> %cvt to <4 x i64> 499 ret <4 x i64> %res 500} 501declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone 502 503define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 504; CHECK-LABEL: test_mm256_div_pd: 505; CHECK: # %bb.0: 506; CHECK-NEXT: vdivpd %ymm1, %ymm0, %ymm0 507; CHECK-NEXT: ret{{[l|q]}} 508 %res = fdiv <4 x double> %a0, %a1 509 ret <4 x double> %res 510} 511 512define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 513; CHECK-LABEL: test_mm256_div_ps: 514; CHECK: # %bb.0: 515; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 516; CHECK-NEXT: ret{{[l|q]}} 517 %res = fdiv <8 x float> %a0, %a1 518 ret <8 x float> %res 519} 520 521define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 522; CHECK-LABEL: test_mm256_dp_ps: 523; CHECK: # %bb.0: 524; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 525; CHECK-NEXT: ret{{[l|q]}} 526 %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) 527 ret <8 x float> %res 528} 529declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 530 531define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind { 532; CHECK-LABEL: test_mm256_extract_epi8: 533; CHECK: # %bb.0: 534; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 535; CHECK-NEXT: vpextrb $15, %xmm0, %eax 536; CHECK-NEXT: movzbl %al, %eax 537; CHECK-NEXT: vzeroupper 538; CHECK-NEXT: ret{{[l|q]}} 539 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 540 %ext = extractelement <32 x i8> %arg0, i32 31 541 %res = zext i8 %ext to i32 542 ret i32 %res 543} 544 545define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind { 546; CHECK-LABEL: test_mm256_extract_epi16: 547; CHECK: # %bb.0: 548; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 549; CHECK-NEXT: vpextrw $3, %xmm0, %eax 550; CHECK-NEXT: movzwl %ax, %eax 551; CHECK-NEXT: vzeroupper 552; CHECK-NEXT: ret{{[l|q]}} 553 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 554 %ext = extractelement <16 x i16> %arg0, i32 11 555 %res = zext i16 %ext to i32 556 ret i32 %res 557} 558 559define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind { 560; CHECK-LABEL: test_mm256_extract_epi32: 561; CHECK: # %bb.0: 562; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 563; CHECK-NEXT: vextractps $1, %xmm0, %eax 564; CHECK-NEXT: vzeroupper 565; CHECK-NEXT: ret{{[l|q]}} 566 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 567 %res = extractelement <8 x i32> %arg0, i32 5 568 ret i32 %res 569} 570 571define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind { 572; X86-LABEL: test_mm256_extract_epi64: 573; X86: # %bb.0: 574; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 575; X86-NEXT: vextractps $2, %xmm0, %eax 576; X86-NEXT: vextractps $3, %xmm0, %edx 577; X86-NEXT: vzeroupper 578; X86-NEXT: retl 579; 580; X64-LABEL: test_mm256_extract_epi64: 581; X64: # %bb.0: 582; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 583; X64-NEXT: vpextrq $1, %xmm0, %rax 584; X64-NEXT: vzeroupper 585; X64-NEXT: retq 586 %res = extractelement <4 x i64> %a0, i32 3 587 ret i64 %res 588} 589 590define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind { 591; CHECK-LABEL: test_mm256_extractf128_pd: 592; CHECK: # %bb.0: 593; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 594; CHECK-NEXT: vzeroupper 595; CHECK-NEXT: ret{{[l|q]}} 596 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3> 597 ret <2 x double> %res 598} 599 600define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind { 601; CHECK-LABEL: test_mm256_extractf128_ps: 602; CHECK: # %bb.0: 603; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 604; CHECK-NEXT: vzeroupper 605; CHECK-NEXT: ret{{[l|q]}} 606 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 607 ret <4 x float> %res 608} 609 610define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind { 611; CHECK-LABEL: test_mm256_extractf128_si256: 612; CHECK: # %bb.0: 613; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 614; CHECK-NEXT: vzeroupper 615; CHECK-NEXT: ret{{[l|q]}} 616 %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3> 617 ret <2 x i64> %res 618} 619 620define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind { 621; CHECK-LABEL: test_mm256_floor_pd: 622; CHECK: # %bb.0: 623; CHECK-NEXT: vroundpd $1, %ymm0, %ymm0 624; CHECK-NEXT: ret{{[l|q]}} 625 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1) 626 ret <4 x double> %res 627} 628 629define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind { 630; CHECK-LABEL: test_mm256_floor_ps: 631; CHECK: # %bb.0: 632; CHECK-NEXT: vroundps $1, %ymm0, %ymm0 633; CHECK-NEXT: ret{{[l|q]}} 634 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1) 635 ret <8 x float> %res 636} 637 638define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 639; CHECK-LABEL: test_mm256_hadd_pd: 640; CHECK: # %bb.0: 641; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 642; CHECK-NEXT: ret{{[l|q]}} 643 %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) 644 ret <4 x double> %res 645} 646declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone 647 648define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 649; CHECK-LABEL: test_mm256_hadd_ps: 650; CHECK: # %bb.0: 651; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 652; CHECK-NEXT: ret{{[l|q]}} 653 %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) 654 ret <8 x float> %res 655} 656declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone 657 658define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 659; CHECK-LABEL: test_mm256_hsub_pd: 660; CHECK: # %bb.0: 661; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 662; CHECK-NEXT: ret{{[l|q]}} 663 %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) 664 ret <4 x double> %res 665} 666declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone 667 668define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 669; CHECK-LABEL: test_mm256_hsub_ps: 670; CHECK: # %bb.0: 671; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0 672; CHECK-NEXT: ret{{[l|q]}} 673 %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) 674 ret <8 x float> %res 675} 676declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone 677 678define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind { 679; X86-LABEL: test_mm256_insert_epi8: 680; X86: # %bb.0: 681; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 682; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1 683; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 684; X86-NEXT: retl 685; 686; X64-LABEL: test_mm256_insert_epi8: 687; X64: # %bb.0: 688; X64-NEXT: movzbl %dil, %eax 689; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1 690; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 691; X64-NEXT: retq 692 %arg0 = bitcast <4 x i64> %a0 to <32 x i8> 693 %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4 694 %bc = bitcast <32 x i8> %res to <4 x i64> 695 ret <4 x i64> %bc 696} 697 698define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind { 699; X86-LABEL: test_mm256_insert_epi16: 700; X86: # %bb.0: 701; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 702; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 703; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 704; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 705; X86-NEXT: retl 706; 707; X64-LABEL: test_mm256_insert_epi16: 708; X64: # %bb.0: 709; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 710; X64-NEXT: vpinsrw $6, %edi, %xmm1, %xmm1 711; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 712; X64-NEXT: retq 713 %arg0 = bitcast <4 x i64> %a0 to <16 x i16> 714 %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14 715 %bc = bitcast <16 x i16> %res to <4 x i64> 716 ret <4 x i64> %bc 717} 718 719define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind { 720; X86-LABEL: test_mm256_insert_epi32: 721; X86: # %bb.0: 722; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1 723; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 724; X86-NEXT: retl 725; 726; X64-LABEL: test_mm256_insert_epi32: 727; X64: # %bb.0: 728; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm1 729; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 730; X64-NEXT: retq 731 %arg0 = bitcast <4 x i64> %a0 to <8 x i32> 732 %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3 733 %bc = bitcast <8 x i32> %res to <4 x i64> 734 ret <4 x i64> %bc 735} 736 737define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind { 738; X86-LABEL: test_mm256_insert_epi64: 739; X86: # %bb.0: 740; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 741; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 742; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 743; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 744; X86-NEXT: retl 745; 746; X64-LABEL: test_mm256_insert_epi64: 747; X64: # %bb.0: 748; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 749; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1 750; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 751; X64-NEXT: retq 752 %res = insertelement <4 x i64> %a0, i64 %a1, i32 3 753 ret <4 x i64> %res 754} 755 756define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind { 757; CHECK-LABEL: test_mm256_insertf128_pd: 758; CHECK: # %bb.0: 759; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 760; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 761; CHECK-NEXT: ret{{[l|q]}} 762 %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 763 %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 764 ret <4 x double> %res 765} 766 767define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind { 768; CHECK-LABEL: test_mm256_insertf128_ps: 769; CHECK: # %bb.0: 770; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 771; CHECK-NEXT: ret{{[l|q]}} 772 %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 773 %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> 774 ret <8 x float> %res 775} 776 777define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind { 778; CHECK-LABEL: test_mm256_insertf128_si256: 779; CHECK: # %bb.0: 780; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 781; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 782; CHECK-NEXT: ret{{[l|q]}} 783 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 784 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 785 ret <4 x i64> %res 786} 787 788define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind { 789; X86-LABEL: test_mm256_lddqu_si256: 790; X86: # %bb.0: 791; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 792; X86-NEXT: vlddqu (%eax), %ymm0 793; X86-NEXT: retl 794; 795; X64-LABEL: test_mm256_lddqu_si256: 796; X64: # %bb.0: 797; X64-NEXT: vlddqu (%rdi), %ymm0 798; X64-NEXT: retq 799 %arg0 = bitcast <4 x i64>* %a0 to i8* 800 %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0) 801 %bc = bitcast <32 x i8> %res to <4 x i64> 802 ret <4 x i64> %bc 803} 804declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone 805 806define <4 x double> @test_mm256_load_pd(double* %a0) nounwind { 807; X86-LABEL: test_mm256_load_pd: 808; X86: # %bb.0: 809; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 810; X86-NEXT: vmovaps (%eax), %ymm0 811; X86-NEXT: retl 812; 813; X64-LABEL: test_mm256_load_pd: 814; X64: # %bb.0: 815; X64-NEXT: vmovaps (%rdi), %ymm0 816; X64-NEXT: retq 817 %arg0 = bitcast double* %a0 to <4 x double>* 818 %res = load <4 x double>, <4 x double>* %arg0, align 32 819 ret <4 x double> %res 820} 821 822define <8 x float> @test_mm256_load_ps(float* %a0) nounwind { 823; X86-LABEL: test_mm256_load_ps: 824; X86: # %bb.0: 825; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 826; X86-NEXT: vmovaps (%eax), %ymm0 827; X86-NEXT: retl 828; 829; X64-LABEL: test_mm256_load_ps: 830; X64: # %bb.0: 831; X64-NEXT: vmovaps (%rdi), %ymm0 832; X64-NEXT: retq 833 %arg0 = bitcast float* %a0 to <8 x float>* 834 %res = load <8 x float>, <8 x float>* %arg0, align 32 835 ret <8 x float> %res 836} 837 838define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind { 839; X86-LABEL: test_mm256_load_si256: 840; X86: # %bb.0: 841; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 842; X86-NEXT: vmovaps (%eax), %ymm0 843; X86-NEXT: retl 844; 845; X64-LABEL: test_mm256_load_si256: 846; X64: # %bb.0: 847; X64-NEXT: vmovaps (%rdi), %ymm0 848; X64-NEXT: retq 849 %res = load <4 x i64>, <4 x i64>* %a0, align 32 850 ret <4 x i64> %res 851} 852 853define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind { 854; X86-LABEL: test_mm256_loadu_pd: 855; X86: # %bb.0: 856; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 857; X86-NEXT: vmovups (%eax), %ymm0 858; X86-NEXT: retl 859; 860; X64-LABEL: test_mm256_loadu_pd: 861; X64: # %bb.0: 862; X64-NEXT: vmovups (%rdi), %ymm0 863; X64-NEXT: retq 864 %arg0 = bitcast double* %a0 to <4 x double>* 865 %res = load <4 x double>, <4 x double>* %arg0, align 1 866 ret <4 x double> %res 867} 868 869define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind { 870; X86-LABEL: test_mm256_loadu_ps: 871; X86: # %bb.0: 872; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 873; X86-NEXT: vmovups (%eax), %ymm0 874; X86-NEXT: retl 875; 876; X64-LABEL: test_mm256_loadu_ps: 877; X64: # %bb.0: 878; X64-NEXT: vmovups (%rdi), %ymm0 879; X64-NEXT: retq 880 %arg0 = bitcast float* %a0 to <8 x float>* 881 %res = load <8 x float>, <8 x float>* %arg0, align 1 882 ret <8 x float> %res 883} 884 885define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind { 886; X86-LABEL: test_mm256_loadu_si256: 887; X86: # %bb.0: 888; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 889; X86-NEXT: vmovups (%eax), %ymm0 890; X86-NEXT: retl 891; 892; X64-LABEL: test_mm256_loadu_si256: 893; X64: # %bb.0: 894; X64-NEXT: vmovups (%rdi), %ymm0 895; X64-NEXT: retq 896 %res = load <4 x i64>, <4 x i64>* %a0, align 1 897 ret <4 x i64> %res 898} 899 900define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind { 901; X86-LABEL: test_mm256_loadu2_m128: 902; X86: # %bb.0: 903; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 904; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 905; X86-NEXT: vmovups (%eax), %xmm0 906; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 907; X86-NEXT: retl 908; 909; X64-LABEL: test_mm256_loadu2_m128: 910; X64: # %bb.0: 911; X64-NEXT: vmovups (%rsi), %xmm0 912; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 913; X64-NEXT: retq 914 %arg0 = bitcast float* %a0 to <4 x float>* 915 %hi4 = load <4 x float>, <4 x float>* %arg0, align 1 916 %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 917 %arg1 = bitcast float* %a1 to <4 x float>* 918 %lo4 = load <4 x float>, <4 x float>* %arg1, align 1 919 %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 920 %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> 921 ret <8 x float> %res 922} 923 924define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind { 925; X86-LABEL: test_mm256_loadu2_m128d: 926; X86: # %bb.0: 927; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 928; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 929; X86-NEXT: vmovups (%eax), %xmm0 930; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 931; X86-NEXT: retl 932; 933; X64-LABEL: test_mm256_loadu2_m128d: 934; X64: # %bb.0: 935; X64-NEXT: vmovups (%rsi), %xmm0 936; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 937; X64-NEXT: retq 938 %arg0 = bitcast double* %a0 to <2 x double>* 939 %hi2 = load <2 x double>, <2 x double>* %arg0, align 1 940 %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 941 %arg1 = bitcast double* %a1 to <2 x double>* 942 %lo2 = load <2 x double>, <2 x double>* %arg1, align 1 943 %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 944 %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 945 ret <4 x double> %res 946} 947 948define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind { 949; X86-LABEL: test_mm256_loadu2_m128i: 950; X86: # %bb.0: 951; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 952; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 953; X86-NEXT: vmovups (%eax), %xmm0 954; X86-NEXT: vinsertf128 $1, (%ecx), %ymm0, %ymm0 955; X86-NEXT: retl 956; 957; X64-LABEL: test_mm256_loadu2_m128i: 958; X64: # %bb.0: 959; X64-NEXT: vmovups (%rsi), %xmm0 960; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 961; X64-NEXT: retq 962 %arg0 = bitcast i64* %a0 to <2 x i64>* 963 %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1 964 %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 965 %arg1 = bitcast i64* %a1 to <2 x i64>* 966 %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1 967 %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 968 %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 969 ret <4 x i64> %res 970} 971 972define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind { 973; X86-LABEL: test_mm_maskload_pd: 974; X86: # %bb.0: 975; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 976; X86-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 977; X86-NEXT: retl 978; 979; X64-LABEL: test_mm_maskload_pd: 980; X64: # %bb.0: 981; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0 982; X64-NEXT: retq 983 %arg0 = bitcast double* %a0 to i8* 984 %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1) 985 ret <2 x double> %res 986} 987declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone 988 989define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind { 990; X86-LABEL: test_mm256_maskload_pd: 991; X86: # %bb.0: 992; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 993; X86-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 994; X86-NEXT: retl 995; 996; X64-LABEL: test_mm256_maskload_pd: 997; X64: # %bb.0: 998; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 999; X64-NEXT: retq 1000 %arg0 = bitcast double* %a0 to i8* 1001 %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1) 1002 ret <4 x double> %res 1003} 1004declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readnone 1005 1006define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind { 1007; X86-LABEL: test_mm_maskload_ps: 1008; X86: # %bb.0: 1009; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1010; X86-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 1011; X86-NEXT: retl 1012; 1013; X64-LABEL: test_mm_maskload_ps: 1014; X64: # %bb.0: 1015; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 1016; X64-NEXT: retq 1017 %arg0 = bitcast float* %a0 to i8* 1018 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1019 %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1) 1020 ret <4 x float> %res 1021} 1022declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone 1023 1024define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind { 1025; X86-LABEL: test_mm256_maskload_ps: 1026; X86: # %bb.0: 1027; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1028; X86-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 1029; X86-NEXT: retl 1030; 1031; X64-LABEL: test_mm256_maskload_ps: 1032; X64: # %bb.0: 1033; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 1034; X64-NEXT: retq 1035 %arg0 = bitcast float* %a0 to i8* 1036 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1037 %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1) 1038 ret <8 x float> %res 1039} 1040declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readnone 1041 1042define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind { 1043; X86-LABEL: test_mm_maskstore_pd: 1044; X86: # %bb.0: 1045; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1046; X86-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) 1047; X86-NEXT: retl 1048; 1049; X64-LABEL: test_mm_maskstore_pd: 1050; X64: # %bb.0: 1051; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) 1052; X64-NEXT: retq 1053 %arg0 = bitcast double* %a0 to i8* 1054 call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2) 1055 ret void 1056} 1057declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind readnone 1058 1059define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind { 1060; X86-LABEL: test_mm256_maskstore_pd: 1061; X86: # %bb.0: 1062; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1063; X86-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) 1064; X86-NEXT: vzeroupper 1065; X86-NEXT: retl 1066; 1067; X64-LABEL: test_mm256_maskstore_pd: 1068; X64: # %bb.0: 1069; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) 1070; X64-NEXT: vzeroupper 1071; X64-NEXT: retq 1072 %arg0 = bitcast double* %a0 to i8* 1073 call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2) 1074 ret void 1075} 1076declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind readnone 1077 1078define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind { 1079; X86-LABEL: test_mm_maskstore_ps: 1080; X86: # %bb.0: 1081; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1082; X86-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) 1083; X86-NEXT: retl 1084; 1085; X64-LABEL: test_mm_maskstore_ps: 1086; X64: # %bb.0: 1087; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) 1088; X64-NEXT: retq 1089 %arg0 = bitcast float* %a0 to i8* 1090 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1091 call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2) 1092 ret void 1093} 1094declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind readnone 1095 1096define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind { 1097; X86-LABEL: test_mm256_maskstore_ps: 1098; X86: # %bb.0: 1099; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 1100; X86-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) 1101; X86-NEXT: vzeroupper 1102; X86-NEXT: retl 1103; 1104; X64-LABEL: test_mm256_maskstore_ps: 1105; X64: # %bb.0: 1106; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) 1107; X64-NEXT: vzeroupper 1108; X64-NEXT: retq 1109 %arg0 = bitcast float* %a0 to i8* 1110 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1111 call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2) 1112 ret void 1113} 1114declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone 1115 1116define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1117; CHECK-LABEL: test_mm256_max_pd: 1118; CHECK: # %bb.0: 1119; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 1120; CHECK-NEXT: ret{{[l|q]}} 1121 %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) 1122 ret <4 x double> %res 1123} 1124declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone 1125 1126define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1127; CHECK-LABEL: test_mm256_max_ps: 1128; CHECK: # %bb.0: 1129; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 1130; CHECK-NEXT: ret{{[l|q]}} 1131 %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) 1132 ret <8 x float> %res 1133} 1134declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone 1135 1136define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1137; CHECK-LABEL: test_mm256_min_pd: 1138; CHECK: # %bb.0: 1139; CHECK-NEXT: vminpd %ymm1, %ymm0, %ymm0 1140; CHECK-NEXT: ret{{[l|q]}} 1141 %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) 1142 ret <4 x double> %res 1143} 1144declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone 1145 1146define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1147; CHECK-LABEL: test_mm256_min_ps: 1148; CHECK: # %bb.0: 1149; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 1150; CHECK-NEXT: ret{{[l|q]}} 1151 %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) 1152 ret <8 x float> %res 1153} 1154declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone 1155 1156define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind { 1157; CHECK-LABEL: test_mm256_movedup_pd: 1158; CHECK: # %bb.0: 1159; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 1160; CHECK-NEXT: ret{{[l|q]}} 1161 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 1162 ret <4 x double> %res 1163} 1164 1165define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind { 1166; CHECK-LABEL: test_mm256_movehdup_ps: 1167; CHECK: # %bb.0: 1168; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] 1169; CHECK-NEXT: ret{{[l|q]}} 1170 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 1171 ret <8 x float> %res 1172} 1173 1174define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind { 1175; CHECK-LABEL: test_mm256_moveldup_ps: 1176; CHECK: # %bb.0: 1177; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] 1178; CHECK-NEXT: ret{{[l|q]}} 1179 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 1180 ret <8 x float> %res 1181} 1182 1183define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind { 1184; CHECK-LABEL: test_mm256_movemask_pd: 1185; CHECK: # %bb.0: 1186; CHECK-NEXT: vmovmskpd %ymm0, %eax 1187; CHECK-NEXT: vzeroupper 1188; CHECK-NEXT: ret{{[l|q]}} 1189 %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) 1190 ret i32 %res 1191} 1192declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone 1193 1194define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind { 1195; CHECK-LABEL: test_mm256_movemask_ps: 1196; CHECK: # %bb.0: 1197; CHECK-NEXT: vmovmskps %ymm0, %eax 1198; CHECK-NEXT: vzeroupper 1199; CHECK-NEXT: ret{{[l|q]}} 1200 %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) 1201 ret i32 %res 1202} 1203declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone 1204 1205define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1206; CHECK-LABEL: test_mm256_mul_pd: 1207; CHECK: # %bb.0: 1208; CHECK-NEXT: vmulpd %ymm1, %ymm0, %ymm0 1209; CHECK-NEXT: ret{{[l|q]}} 1210 %res = fmul <4 x double> %a0, %a1 1211 ret <4 x double> %res 1212} 1213 1214define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1215; CHECK-LABEL: test_mm256_mul_ps: 1216; CHECK: # %bb.0: 1217; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 1218; CHECK-NEXT: ret{{[l|q]}} 1219 %res = fmul <8 x float> %a0, %a1 1220 ret <8 x float> %res 1221} 1222 1223define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1224; CHECK-LABEL: test_mm256_or_pd: 1225; CHECK: # %bb.0: 1226; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 1227; CHECK-NEXT: ret{{[l|q]}} 1228 %1 = bitcast <4 x double> %a0 to <4 x i64> 1229 %2 = bitcast <4 x double> %a1 to <4 x i64> 1230 %res = or <4 x i64> %1, %2 1231 %bc = bitcast <4 x i64> %res to <4 x double> 1232 ret <4 x double> %bc 1233} 1234 1235define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1236; CHECK-LABEL: test_mm256_or_ps: 1237; CHECK: # %bb.0: 1238; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 1239; CHECK-NEXT: ret{{[l|q]}} 1240 %1 = bitcast <8 x float> %a0 to <8 x i32> 1241 %2 = bitcast <8 x float> %a1 to <8 x i32> 1242 %res = or <8 x i32> %1, %2 1243 %bc = bitcast <8 x i32> %res to <8 x float> 1244 ret <8 x float> %bc 1245} 1246 1247define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind { 1248; CHECK-LABEL: test_mm_permute_pd: 1249; CHECK: # %bb.0: 1250; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] 1251; CHECK-NEXT: ret{{[l|q]}} 1252 %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0> 1253 ret <2 x double> %res 1254} 1255 1256define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind { 1257; CHECK-LABEL: test_mm256_permute_pd: 1258; CHECK: # %bb.0: 1259; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] 1260; CHECK-NEXT: ret{{[l|q]}} 1261 %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 1262 ret <4 x double> %res 1263} 1264 1265define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind { 1266; CHECK-LABEL: test_mm_permute_ps: 1267; CHECK: # %bb.0: 1268; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] 1269; CHECK-NEXT: ret{{[l|q]}} 1270 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1271 ret <4 x float> %res 1272} 1273 1274define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind { 1275; CHECK-LABEL: test2_mm_permute_ps: 1276; CHECK: # %bb.0: 1277; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3] 1278; CHECK-NEXT: ret{{[l|q]}} 1279 %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3> 1280 ret <4 x float> %res 1281} 1282 1283define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind { 1284; CHECK-LABEL: test_mm256_permute_ps: 1285; CHECK: # %bb.0: 1286; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 1287; CHECK-NEXT: ret{{[l|q]}} 1288 %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> 1289 ret <8 x float> %res 1290} 1291 1292define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 1293; CHECK-LABEL: test_mm256_permute2f128_pd: 1294; CHECK: # %bb.0: 1295; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1] 1296; CHECK-NEXT: ret{{[l|q]}} 1297 %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1298 ret <4 x double> %res 1299} 1300declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone 1301 1302; PR26667 1303define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 1304; CHECK-LABEL: test_mm256_permute2f128_ps: 1305; CHECK: # %bb.0: 1306; CHECK-NEXT: vmovaps %ymm1, %ymm0 1307; CHECK-NEXT: ret{{[l|q]}} 1308 %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> 1309 ret <8 x float> %res 1310} 1311declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone 1312 1313define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1314; CHECK-LABEL: test_mm256_permute2f128_si256: 1315; CHECK: # %bb.0: 1316; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1] 1317; CHECK-NEXT: ret{{[l|q]}} 1318 %1 = bitcast <4 x i64> %a0 to <8 x i32> 1319 %2 = bitcast <4 x i64> %a1 to <8 x i32> 1320 %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 1321 %bc = bitcast <8 x i32> %res to <4 x i64> 1322 ret <4 x i64> %bc 1323} 1324declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone 1325 1326define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind { 1327; CHECK-LABEL: test_mm_permutevar_pd: 1328; CHECK: # %bb.0: 1329; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 1330; CHECK-NEXT: ret{{[l|q]}} 1331 %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) 1332 ret <2 x double> %res 1333} 1334declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone 1335 1336define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind { 1337; CHECK-LABEL: test_mm256_permutevar_pd: 1338; CHECK: # %bb.0: 1339; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 1340; CHECK-NEXT: ret{{[l|q]}} 1341 %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) 1342 ret <4 x double> %res 1343} 1344declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone 1345 1346define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind { 1347; CHECK-LABEL: test_mm_permutevar_ps: 1348; CHECK: # %bb.0: 1349; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0 1350; CHECK-NEXT: ret{{[l|q]}} 1351 %arg1 = bitcast <2 x i64> %a1 to <4 x i32> 1352 %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1) 1353 ret <4 x float> %res 1354} 1355declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone 1356 1357define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind { 1358; CHECK-LABEL: test_mm256_permutevar_ps: 1359; CHECK: # %bb.0: 1360; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0 1361; CHECK-NEXT: ret{{[l|q]}} 1362 %arg1 = bitcast <4 x i64> %a1 to <8 x i32> 1363 %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1) 1364 ret <8 x float> %res 1365} 1366declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone 1367 1368define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind { 1369; CHECK-LABEL: test_mm256_rcp_ps: 1370; CHECK: # %bb.0: 1371; CHECK-NEXT: vrcpps %ymm0, %ymm0 1372; CHECK-NEXT: ret{{[l|q]}} 1373 %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) 1374 ret <8 x float> %res 1375} 1376declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone 1377 1378define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind { 1379; CHECK-LABEL: test_mm256_round_pd: 1380; CHECK: # %bb.0: 1381; CHECK-NEXT: vroundpd $4, %ymm0, %ymm0 1382; CHECK-NEXT: ret{{[l|q]}} 1383 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4) 1384 ret <4 x double> %res 1385} 1386 1387define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind { 1388; CHECK-LABEL: test_mm256_round_ps: 1389; CHECK: # %bb.0: 1390; CHECK-NEXT: vroundps $4, %ymm0, %ymm0 1391; CHECK-NEXT: ret{{[l|q]}} 1392 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4) 1393 ret <8 x float> %res 1394} 1395 1396define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind { 1397; CHECK-LABEL: test_mm256_rsqrt_ps: 1398; CHECK: # %bb.0: 1399; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 1400; CHECK-NEXT: ret{{[l|q]}} 1401 %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) 1402 ret <8 x float> %res 1403} 1404declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone 1405 1406define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind { 1407; X86-LABEL: test_mm256_set_epi8: 1408; X86: # %bb.0: 1409; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1410; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 1411; X86-NEXT: vmovd %ecx, %xmm0 1412; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 1413; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1414; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 1415; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1416; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 1417; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1418; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 1419; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1420; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 1421; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1422; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 1423; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1424; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 1425; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1426; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 1427; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1428; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 1429; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1430; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 1431; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1432; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 1433; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1434; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 1435; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1436; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 1437; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1438; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 1439; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1440; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 1441; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1442; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 1443; X86-NEXT: vmovd %ecx, %xmm1 1444; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 1445; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1446; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 1447; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1448; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 1449; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1450; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 1451; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1452; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 1453; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1454; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 1455; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1456; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 1457; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1458; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 1459; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1460; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 1461; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1462; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 1463; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1464; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 1465; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1466; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 1467; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1468; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 1469; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1470; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 1471; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1472; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 1473; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1474; X86-NEXT: retl 1475; 1476; X64-LABEL: test_mm256_set_epi8: 1477; X64: # %bb.0: 1478; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d 1479; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1480; X64-NEXT: vmovd %eax, %xmm0 1481; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 1482; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1483; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 1484; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1485; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 1486; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1487; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 1488; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1489; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 1490; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1491; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 1492; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1493; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 1494; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1495; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 1496; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1497; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 1498; X64-NEXT: movzbl %r9b, %eax 1499; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 1500; X64-NEXT: movzbl %r8b, %eax 1501; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 1502; X64-NEXT: movzbl %cl, %eax 1503; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 1504; X64-NEXT: movzbl %dl, %eax 1505; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 1506; X64-NEXT: movzbl %sil, %eax 1507; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 1508; X64-NEXT: movzbl %dil, %eax 1509; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 1510; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1511; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx 1512; X64-NEXT: vmovd %ecx, %xmm1 1513; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 1514; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1515; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 1516; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1517; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 1518; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1519; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 1520; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1521; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 1522; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1523; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 1524; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1525; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 1526; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1527; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 1528; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1529; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 1530; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1531; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 1532; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1533; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 1534; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1535; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 1536; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1537; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 1538; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1539; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 1540; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1541; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 1542; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1543; X64-NEXT: retq 1544 %res0 = insertelement <32 x i8> undef, i8 %a31, i32 0 1545 %res1 = insertelement <32 x i8> %res0, i8 %a30, i32 1 1546 %res2 = insertelement <32 x i8> %res1, i8 %a29, i32 2 1547 %res3 = insertelement <32 x i8> %res2, i8 %a28, i32 3 1548 %res4 = insertelement <32 x i8> %res3, i8 %a27, i32 4 1549 %res5 = insertelement <32 x i8> %res4, i8 %a26, i32 5 1550 %res6 = insertelement <32 x i8> %res5, i8 %a25, i32 6 1551 %res7 = insertelement <32 x i8> %res6, i8 %a24, i32 7 1552 %res8 = insertelement <32 x i8> %res7, i8 %a23, i32 8 1553 %res9 = insertelement <32 x i8> %res8, i8 %a22, i32 9 1554 %res10 = insertelement <32 x i8> %res9, i8 %a21, i32 10 1555 %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11 1556 %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12 1557 %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13 1558 %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14 1559 %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15 1560 %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16 1561 %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17 1562 %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18 1563 %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19 1564 %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20 1565 %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21 1566 %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22 1567 %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23 1568 %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24 1569 %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25 1570 %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26 1571 %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27 1572 %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28 1573 %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29 1574 %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30 1575 %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31 1576 %res = bitcast <32 x i8> %res31 to <4 x i64> 1577 ret <4 x i64> %res 1578} 1579 1580define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { 1581; X86-LABEL: test_mm256_set_epi16: 1582; X86: # %bb.0: 1583; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1584; X86-NEXT: vmovd %eax, %xmm0 1585; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1586; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 1587; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1588; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 1589; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1590; X86-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 1591; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1592; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 1593; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1594; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 1595; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1596; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 1597; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1598; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 1599; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1600; X86-NEXT: vmovd %eax, %xmm1 1601; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1602; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 1603; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1604; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 1605; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1606; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 1607; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1608; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 1609; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1610; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 1611; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1612; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 1613; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1614; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 1615; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1616; X86-NEXT: retl 1617; 1618; X64-LABEL: test_mm256_set_epi16: 1619; X64: # %bb.0: 1620; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1621; X64-NEXT: vmovd %eax, %xmm0 1622; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1623; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 1624; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0 1625; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 1626; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 1627; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 1628; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0 1629; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 1630; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1631; X64-NEXT: vmovd %eax, %xmm1 1632; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1633; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 1634; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1635; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 1636; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1637; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 1638; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1639; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 1640; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1641; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 1642; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1643; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 1644; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1645; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 1646; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1647; X64-NEXT: retq 1648 %res0 = insertelement <16 x i16> undef, i16 %a15, i32 0 1649 %res1 = insertelement <16 x i16> %res0, i16 %a14, i32 1 1650 %res2 = insertelement <16 x i16> %res1, i16 %a13, i32 2 1651 %res3 = insertelement <16 x i16> %res2, i16 %a12, i32 3 1652 %res4 = insertelement <16 x i16> %res3, i16 %a11, i32 4 1653 %res5 = insertelement <16 x i16> %res4, i16 %a10, i32 5 1654 %res6 = insertelement <16 x i16> %res5, i16 %a9 , i32 6 1655 %res7 = insertelement <16 x i16> %res6, i16 %a8 , i32 7 1656 %res8 = insertelement <16 x i16> %res7, i16 %a7 , i32 8 1657 %res9 = insertelement <16 x i16> %res8, i16 %a6 , i32 9 1658 %res10 = insertelement <16 x i16> %res9, i16 %a5 , i32 10 1659 %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11 1660 %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12 1661 %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13 1662 %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14 1663 %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15 1664 %res = bitcast <16 x i16> %res15 to <4 x i64> 1665 ret <4 x i64> %res 1666} 1667 1668define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { 1669; X86-LABEL: test_mm256_set_epi32: 1670; X86: # %bb.0: 1671; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1672; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 1673; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 1674; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 1675; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1676; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 1677; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 1678; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 1679; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1680; X86-NEXT: retl 1681; 1682; X64-LABEL: test_mm256_set_epi32: 1683; X64: # %bb.0: 1684; X64-NEXT: vmovd %ecx, %xmm0 1685; X64-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 1686; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 1687; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 1688; X64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1689; X64-NEXT: vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 1690; X64-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1 1691; X64-NEXT: vpinsrd $3, %r8d, %xmm1, %xmm1 1692; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1693; X64-NEXT: retq 1694 %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0 1695 %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1 1696 %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2 1697 %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3 1698 %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4 1699 %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5 1700 %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6 1701 %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7 1702 %res = bitcast <8 x i32> %res7 to <4 x i64> 1703 ret <4 x i64> %res 1704} 1705 1706define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind { 1707; X86-LABEL: test_mm256_set_epi64x: 1708; X86: # %bb.0: 1709; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1710; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 1711; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 1712; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 1713; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1714; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 1715; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 1716; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 1717; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1718; X86-NEXT: retl 1719; 1720; X64-LABEL: test_mm256_set_epi64x: 1721; X64: # %bb.0: 1722; X64-NEXT: vmovq %rdi, %xmm0 1723; X64-NEXT: vmovq %rsi, %xmm1 1724; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1725; X64-NEXT: vmovq %rdx, %xmm1 1726; X64-NEXT: vmovq %rcx, %xmm2 1727; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 1728; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1729; X64-NEXT: retq 1730 %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0 1731 %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1 1732 %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2 1733 %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3 1734 ret <4 x i64> %res3 1735} 1736 1737define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind { 1738; CHECK-LABEL: test_mm256_set_m128: 1739; CHECK: # %bb.0: 1740; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1741; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1742; CHECK-NEXT: ret{{[l|q]}} 1743 %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1744 ret <8 x float> %res 1745} 1746 1747define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { 1748; CHECK-LABEL: test_mm256_set_m128d: 1749; CHECK: # %bb.0: 1750; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1751; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1752; CHECK-NEXT: ret{{[l|q]}} 1753 %arg0 = bitcast <2 x double> %a0 to <4 x float> 1754 %arg1 = bitcast <2 x double> %a1 to <4 x float> 1755 %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1756 %bc = bitcast <8 x float> %res to <4 x double> 1757 ret <4 x double> %bc 1758} 1759 1760define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { 1761; CHECK-LABEL: test_mm256_set_m128i: 1762; CHECK: # %bb.0: 1763; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1764; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1765; CHECK-NEXT: ret{{[l|q]}} 1766 %arg0 = bitcast <2 x i64> %a0 to <4 x float> 1767 %arg1 = bitcast <2 x i64> %a1 to <4 x float> 1768 %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1769 %bc = bitcast <8 x float> %res to <4 x i64> 1770 ret <4 x i64> %bc 1771} 1772 1773define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind { 1774; X86-LABEL: test_mm256_set_pd: 1775; X86: # %bb.0: 1776; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1777; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 1778; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1779; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 1780; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero 1781; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1782; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1783; X86-NEXT: retl 1784; 1785; X64-LABEL: test_mm256_set_pd: 1786; X64: # %bb.0: 1787; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1788; X64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] 1789; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1790; X64-NEXT: retq 1791 %res0 = insertelement <4 x double> undef, double %a3, i32 0 1792 %res1 = insertelement <4 x double> %res0, double %a2, i32 1 1793 %res2 = insertelement <4 x double> %res1, double %a1, i32 2 1794 %res3 = insertelement <4 x double> %res2, double %a0, i32 3 1795 ret <4 x double> %res3 1796} 1797 1798define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind { 1799; X86-LABEL: test_mm256_set_ps: 1800; X86: # %bb.0: 1801; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1802; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1803; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 1804; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1805; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 1806; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1807; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 1808; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1809; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1810; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] 1811; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1812; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] 1813; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1814; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] 1815; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1816; X86-NEXT: retl 1817; 1818; X64-LABEL: test_mm256_set_ps: 1819; X64: # %bb.0: 1820; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 1821; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 1822; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 1823; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] 1824; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] 1825; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] 1826; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1827; X64-NEXT: retq 1828 %res0 = insertelement <8 x float> undef, float %a7, i32 0 1829 %res1 = insertelement <8 x float> %res0, float %a6, i32 1 1830 %res2 = insertelement <8 x float> %res1, float %a5, i32 2 1831 %res3 = insertelement <8 x float> %res2, float %a4, i32 3 1832 %res4 = insertelement <8 x float> %res3, float %a3, i32 4 1833 %res5 = insertelement <8 x float> %res4, float %a2, i32 5 1834 %res6 = insertelement <8 x float> %res5, float %a1, i32 6 1835 %res7 = insertelement <8 x float> %res6, float %a0, i32 7 1836 ret <8 x float> %res7 1837} 1838 1839define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind { 1840; X86-LABEL: test_mm256_set1_epi8: 1841; X86: # %bb.0: 1842; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 1843; X86-NEXT: vmovd %eax, %xmm0 1844; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 1845; X86-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1846; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1847; X86-NEXT: retl 1848; 1849; X64-LABEL: test_mm256_set1_epi8: 1850; X64: # %bb.0: 1851; X64-NEXT: movzbl %dil, %eax 1852; X64-NEXT: vmovd %eax, %xmm0 1853; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 1854; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1855; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1856; X64-NEXT: retq 1857 %res0 = insertelement <32 x i8> undef, i8 %a0, i32 0 1858 %res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1 1859 %res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2 1860 %res3 = insertelement <32 x i8> %res2, i8 %a0, i32 3 1861 %res4 = insertelement <32 x i8> %res3, i8 %a0, i32 4 1862 %res5 = insertelement <32 x i8> %res4, i8 %a0, i32 5 1863 %res6 = insertelement <32 x i8> %res5, i8 %a0, i32 6 1864 %res7 = insertelement <32 x i8> %res6, i8 %a0, i32 7 1865 %res8 = insertelement <32 x i8> %res7, i8 %a0, i32 8 1866 %res9 = insertelement <32 x i8> %res8, i8 %a0, i32 9 1867 %res10 = insertelement <32 x i8> %res9, i8 %a0, i32 10 1868 %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11 1869 %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12 1870 %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13 1871 %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14 1872 %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15 1873 %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16 1874 %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17 1875 %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18 1876 %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19 1877 %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20 1878 %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21 1879 %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22 1880 %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23 1881 %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24 1882 %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25 1883 %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26 1884 %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27 1885 %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28 1886 %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29 1887 %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30 1888 %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31 1889 %res = bitcast <32 x i8> %res31 to <4 x i64> 1890 ret <4 x i64> %res 1891} 1892 1893define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { 1894; X86-LABEL: test_mm256_set1_epi16: 1895; X86: # %bb.0: 1896; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1897; X86-NEXT: vmovd %eax, %xmm0 1898; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1899; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1900; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1901; X86-NEXT: retl 1902; 1903; X64-LABEL: test_mm256_set1_epi16: 1904; X64: # %bb.0: 1905; X64-NEXT: vmovd %edi, %xmm0 1906; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 1907; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1908; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1909; X64-NEXT: retq 1910 %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0 1911 %res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1 1912 %res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2 1913 %res3 = insertelement <16 x i16> %res2, i16 %a0, i32 3 1914 %res4 = insertelement <16 x i16> %res3, i16 %a0, i32 4 1915 %res5 = insertelement <16 x i16> %res4, i16 %a0, i32 5 1916 %res6 = insertelement <16 x i16> %res5, i16 %a0, i32 6 1917 %res7 = insertelement <16 x i16> %res6, i16 %a0, i32 7 1918 %res8 = insertelement <16 x i16> %res7, i16 %a0, i32 8 1919 %res9 = insertelement <16 x i16> %res8, i16 %a0, i32 9 1920 %res10 = insertelement <16 x i16> %res9, i16 %a0, i32 10 1921 %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11 1922 %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12 1923 %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13 1924 %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14 1925 %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15 1926 %res = bitcast <16 x i16> %res15 to <4 x i64> 1927 ret <4 x i64> %res 1928} 1929 1930define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind { 1931; X86-LABEL: test_mm256_set1_epi32: 1932; X86: # %bb.0: 1933; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1934; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1935; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1936; X86-NEXT: retl 1937; 1938; X64-LABEL: test_mm256_set1_epi32: 1939; X64: # %bb.0: 1940; X64-NEXT: vmovd %edi, %xmm0 1941; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1942; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1943; X64-NEXT: retq 1944 %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0 1945 %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1 1946 %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2 1947 %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3 1948 %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4 1949 %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5 1950 %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6 1951 %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7 1952 %res = bitcast <8 x i32> %res7 to <4 x i64> 1953 ret <4 x i64> %res 1954} 1955 1956define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind { 1957; X86-LABEL: test_mm256_set1_epi64x: 1958; X86: # %bb.0: 1959; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1960; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 1961; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1962; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1963; X86-NEXT: retl 1964; 1965; X64-LABEL: test_mm256_set1_epi64x: 1966; X64: # %bb.0: 1967; X64-NEXT: vmovq %rdi, %xmm0 1968; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1969; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1970; X64-NEXT: retq 1971 %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 1972 %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1 1973 %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2 1974 %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3 1975 ret <4 x i64> %res3 1976} 1977 1978define <4 x double> @test_mm256_set1_pd(double %a0) nounwind { 1979; X86-LABEL: test_mm256_set1_pd: 1980; X86: # %bb.0: 1981; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1982; X86-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1983; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1984; X86-NEXT: retl 1985; 1986; X64-LABEL: test_mm256_set1_pd: 1987; X64: # %bb.0: 1988; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1989; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1990; X64-NEXT: retq 1991 %res0 = insertelement <4 x double> undef, double %a0, i32 0 1992 %res1 = insertelement <4 x double> %res0, double %a0, i32 1 1993 %res2 = insertelement <4 x double> %res1, double %a0, i32 2 1994 %res3 = insertelement <4 x double> %res2, double %a0, i32 3 1995 ret <4 x double> %res3 1996} 1997 1998define <8 x float> @test_mm256_set1_ps(float %a0) nounwind { 1999; X86-LABEL: test_mm256_set1_ps: 2000; X86: # %bb.0: 2001; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2002; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 2003; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2004; X86-NEXT: retl 2005; 2006; X64-LABEL: test_mm256_set1_ps: 2007; X64: # %bb.0: 2008; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 2009; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2010; X64-NEXT: retq 2011 %res0 = insertelement <8 x float> undef, float %a0, i32 0 2012 %res1 = insertelement <8 x float> %res0, float %a0, i32 1 2013 %res2 = insertelement <8 x float> %res1, float %a0, i32 2 2014 %res3 = insertelement <8 x float> %res2, float %a0, i32 3 2015 %res4 = insertelement <8 x float> %res3, float %a0, i32 4 2016 %res5 = insertelement <8 x float> %res4, float %a0, i32 5 2017 %res6 = insertelement <8 x float> %res5, float %a0, i32 6 2018 %res7 = insertelement <8 x float> %res6, float %a0, i32 7 2019 ret <8 x float> %res7 2020} 2021 2022define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind { 2023; X86-LABEL: test_mm256_setr_epi8: 2024; X86: # %bb.0: 2025; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2026; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 2027; X86-NEXT: vmovd %ecx, %xmm0 2028; X86-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 2029; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2030; X86-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 2031; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2032; X86-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 2033; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2034; X86-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 2035; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2036; X86-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 2037; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2038; X86-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 2039; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2040; X86-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 2041; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2042; X86-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 2043; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2044; X86-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 2045; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2046; X86-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 2047; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2048; X86-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 2049; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2050; X86-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 2051; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2052; X86-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 2053; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2054; X86-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 2055; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2056; X86-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 2057; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2058; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 2059; X86-NEXT: vmovd %ecx, %xmm1 2060; X86-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 2061; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2062; X86-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 2063; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2064; X86-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 2065; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2066; X86-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 2067; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2068; X86-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 2069; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2070; X86-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 2071; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2072; X86-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 2073; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2074; X86-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 2075; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2076; X86-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 2077; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2078; X86-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 2079; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2080; X86-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 2081; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2082; X86-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 2083; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2084; X86-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 2085; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2086; X86-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 2087; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2088; X86-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 2089; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2090; X86-NEXT: retl 2091; 2092; X64-LABEL: test_mm256_setr_epi8: 2093; X64: # %bb.0: 2094; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d 2095; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2096; X64-NEXT: vmovd %eax, %xmm0 2097; X64-NEXT: vpinsrb $1, %r10d, %xmm0, %xmm0 2098; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2099; X64-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 2100; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2101; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 2102; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2103; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 2104; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2105; X64-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 2106; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2107; X64-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 2108; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2109; X64-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 2110; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2111; X64-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 2112; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2113; X64-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 2114; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2115; X64-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 2116; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2117; X64-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 2118; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2119; X64-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 2120; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2121; X64-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 2122; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2123; X64-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 2124; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2125; X64-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 2126; X64-NEXT: movzbl %sil, %eax 2127; X64-NEXT: movzbl %dil, %esi 2128; X64-NEXT: vmovd %esi, %xmm1 2129; X64-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 2130; X64-NEXT: movzbl %dl, %eax 2131; X64-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 2132; X64-NEXT: movzbl %cl, %eax 2133; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 2134; X64-NEXT: movzbl %r8b, %eax 2135; X64-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 2136; X64-NEXT: movzbl %r9b, %eax 2137; X64-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 2138; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2139; X64-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 2140; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2141; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 2142; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2143; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 2144; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2145; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 2146; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2147; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 2148; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2149; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 2150; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2151; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 2152; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2153; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 2154; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2155; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 2156; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 2157; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 2158; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2159; X64-NEXT: retq 2160 %res0 = insertelement <32 x i8> undef, i8 %a0 , i32 0 2161 %res1 = insertelement <32 x i8> %res0, i8 %a1 , i32 1 2162 %res2 = insertelement <32 x i8> %res1, i8 %a2 , i32 2 2163 %res3 = insertelement <32 x i8> %res2, i8 %a3 , i32 3 2164 %res4 = insertelement <32 x i8> %res3, i8 %a4 , i32 4 2165 %res5 = insertelement <32 x i8> %res4, i8 %a5 , i32 5 2166 %res6 = insertelement <32 x i8> %res5, i8 %a6 , i32 6 2167 %res7 = insertelement <32 x i8> %res6, i8 %a7 , i32 7 2168 %res8 = insertelement <32 x i8> %res7, i8 %a8 , i32 8 2169 %res9 = insertelement <32 x i8> %res8, i8 %a9 , i32 9 2170 %res10 = insertelement <32 x i8> %res9, i8 %a10, i32 10 2171 %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11 2172 %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12 2173 %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13 2174 %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14 2175 %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15 2176 %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16 2177 %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17 2178 %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18 2179 %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19 2180 %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20 2181 %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21 2182 %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22 2183 %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23 2184 %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24 2185 %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25 2186 %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26 2187 %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27 2188 %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28 2189 %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29 2190 %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30 2191 %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31 2192 %res = bitcast <32 x i8> %res31 to <4 x i64> 2193 ret <4 x i64> %res 2194} 2195 2196define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { 2197; X86-LABEL: test_mm256_setr_epi16: 2198; X86: # %bb.0: 2199; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2200; X86-NEXT: vmovd %eax, %xmm0 2201; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2202; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 2203; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2204; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 2205; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2206; X86-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 2207; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2208; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 2209; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2210; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 2211; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2212; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 2213; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2214; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 2215; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2216; X86-NEXT: vmovd %eax, %xmm1 2217; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2218; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 2219; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2220; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 2221; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2222; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 2223; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2224; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 2225; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2226; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 2227; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2228; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 2229; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 2230; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 2231; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2232; X86-NEXT: retl 2233; 2234; X64-LABEL: test_mm256_setr_epi16: 2235; X64: # %bb.0: 2236; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2237; X64-NEXT: vmovd %eax, %xmm0 2238; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2239; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 2240; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2241; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 2242; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2243; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 2244; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2245; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 2246; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2247; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 2248; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2249; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 2250; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2251; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 2252; X64-NEXT: vmovd %edi, %xmm1 2253; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 2254; X64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 2255; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 2256; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1 2257; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1 2258; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2259; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 2260; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 2261; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 2262; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2263; X64-NEXT: retq 2264 %res0 = insertelement <16 x i16> undef, i16 %a0 , i32 0 2265 %res1 = insertelement <16 x i16> %res0, i16 %a1 , i32 1 2266 %res2 = insertelement <16 x i16> %res1, i16 %a2 , i32 2 2267 %res3 = insertelement <16 x i16> %res2, i16 %a3 , i32 3 2268 %res4 = insertelement <16 x i16> %res3, i16 %a4 , i32 4 2269 %res5 = insertelement <16 x i16> %res4, i16 %a5 , i32 5 2270 %res6 = insertelement <16 x i16> %res5, i16 %a6 , i32 6 2271 %res7 = insertelement <16 x i16> %res6, i16 %a7 , i32 7 2272 %res8 = insertelement <16 x i16> %res7, i16 %a8 , i32 8 2273 %res9 = insertelement <16 x i16> %res8, i16 %a9 , i32 9 2274 %res10 = insertelement <16 x i16> %res9, i16 %a10, i32 10 2275 %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11 2276 %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12 2277 %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13 2278 %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14 2279 %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15 2280 %res = bitcast <16 x i16> %res15 to <4 x i64> 2281 ret <4 x i64> %res 2282} 2283 2284define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { 2285; X86-LABEL: test_mm256_setr_epi32: 2286; X86: # %bb.0: 2287; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2288; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 2289; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 2290; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 2291; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2292; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 2293; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 2294; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 2295; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2296; X86-NEXT: retl 2297; 2298; X64-LABEL: test_mm256_setr_epi32: 2299; X64: # %bb.0: 2300; X64-NEXT: vmovd %r8d, %xmm0 2301; X64-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0 2302; X64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 2303; X64-NEXT: vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 2304; X64-NEXT: vmovd %edi, %xmm1 2305; X64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 2306; X64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 2307; X64-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1 2308; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2309; X64-NEXT: retq 2310 %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0 2311 %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1 2312 %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2 2313 %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3 2314 %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4 2315 %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5 2316 %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6 2317 %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7 2318 %res = bitcast <8 x i32> %res7 to <4 x i64> 2319 ret <4 x i64> %res 2320} 2321 2322define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind { 2323; X86-LABEL: test_mm256_setr_epi64x: 2324; X86: # %bb.0: 2325; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2326; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 2327; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 2328; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 2329; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2330; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 2331; X86-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 2332; X86-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 2333; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2334; X86-NEXT: retl 2335; 2336; X64-LABEL: test_mm256_setr_epi64x: 2337; X64: # %bb.0: 2338; X64-NEXT: vmovq %rcx, %xmm0 2339; X64-NEXT: vmovq %rdx, %xmm1 2340; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2341; X64-NEXT: vmovq %rsi, %xmm1 2342; X64-NEXT: vmovq %rdi, %xmm2 2343; X64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] 2344; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2345; X64-NEXT: retq 2346 %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0 2347 %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1 2348 %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2 2349 %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3 2350 ret <4 x i64> %res3 2351} 2352 2353define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind { 2354; CHECK-LABEL: test_mm256_setr_m128: 2355; CHECK: # %bb.0: 2356; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2357; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2358; CHECK-NEXT: ret{{[l|q]}} 2359 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2360 ret <8 x float> %res 2361} 2362 2363define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind { 2364; CHECK-LABEL: test_mm256_setr_m128d: 2365; CHECK: # %bb.0: 2366; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2367; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2368; CHECK-NEXT: ret{{[l|q]}} 2369 %arg0 = bitcast <2 x double> %a0 to <4 x float> 2370 %arg1 = bitcast <2 x double> %a1 to <4 x float> 2371 %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2372 %bc = bitcast <8 x float> %res to <4 x double> 2373 ret <4 x double> %bc 2374} 2375 2376define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind { 2377; CHECK-LABEL: test_mm256_setr_m128i: 2378; CHECK: # %bb.0: 2379; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2380; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2381; CHECK-NEXT: ret{{[l|q]}} 2382 %arg0 = bitcast <2 x i64> %a0 to <4 x float> 2383 %arg1 = bitcast <2 x i64> %a1 to <4 x float> 2384 %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2385 %bc = bitcast <8 x float> %res to <4 x i64> 2386 ret <4 x i64> %bc 2387} 2388 2389define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind { 2390; X86-LABEL: test_mm256_setr_pd: 2391; X86: # %bb.0: 2392; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2393; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2394; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2395; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2396; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero 2397; X86-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] 2398; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2399; X86-NEXT: retl 2400; 2401; X64-LABEL: test_mm256_setr_pd: 2402; X64: # %bb.0: 2403; X64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] 2404; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2405; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2406; X64-NEXT: retq 2407 %res0 = insertelement <4 x double> undef, double %a0, i32 0 2408 %res1 = insertelement <4 x double> %res0, double %a1, i32 1 2409 %res2 = insertelement <4 x double> %res1, double %a2, i32 2 2410 %res3 = insertelement <4 x double> %res2, double %a3, i32 3 2411 ret <4 x double> %res3 2412} 2413 2414define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind { 2415; X86-LABEL: test_mm256_setr_ps: 2416; X86: # %bb.0: 2417; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2418; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 2419; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2420; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 2421; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] 2422; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] 2423; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] 2424; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 2425; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 2426; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero 2427; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero 2428; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] 2429; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] 2430; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] 2431; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2432; X86-NEXT: retl 2433; 2434; X64-LABEL: test_mm256_setr_ps: 2435; X64: # %bb.0: 2436; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] 2437; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] 2438; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] 2439; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] 2440; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] 2441; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] 2442; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 2443; X64-NEXT: retq 2444 %res0 = insertelement <8 x float> undef, float %a0, i32 0 2445 %res1 = insertelement <8 x float> %res0, float %a1, i32 1 2446 %res2 = insertelement <8 x float> %res1, float %a2, i32 2 2447 %res3 = insertelement <8 x float> %res2, float %a3, i32 3 2448 %res4 = insertelement <8 x float> %res3, float %a4, i32 4 2449 %res5 = insertelement <8 x float> %res4, float %a5, i32 5 2450 %res6 = insertelement <8 x float> %res5, float %a6, i32 6 2451 %res7 = insertelement <8 x float> %res6, float %a7, i32 7 2452 ret <8 x float> %res7 2453} 2454 2455define <4 x double> @test_mm256_setzero_pd() nounwind { 2456; CHECK-LABEL: test_mm256_setzero_pd: 2457; CHECK: # %bb.0: 2458; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 2459; CHECK-NEXT: ret{{[l|q]}} 2460 ret <4 x double> zeroinitializer 2461} 2462 2463define <8 x float> @test_mm256_setzero_ps() nounwind { 2464; CHECK-LABEL: test_mm256_setzero_ps: 2465; CHECK: # %bb.0: 2466; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 2467; CHECK-NEXT: ret{{[l|q]}} 2468 ret <8 x float> zeroinitializer 2469} 2470 2471define <4 x i64> @test_mm256_setzero_si256() nounwind { 2472; CHECK-LABEL: test_mm256_setzero_si256: 2473; CHECK: # %bb.0: 2474; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 2475; CHECK-NEXT: ret{{[l|q]}} 2476 ret <4 x i64> zeroinitializer 2477} 2478 2479define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2480; CHECK-LABEL: test_mm256_shuffle_pd: 2481; CHECK: # %bb.0: 2482; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 2483; CHECK-NEXT: ret{{[l|q]}} 2484 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 2485 ret <4 x double> %res 2486} 2487 2488define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2489; CHECK-LABEL: test_mm256_shuffle_ps: 2490; CHECK: # %bb.0: 2491; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] 2492; CHECK-NEXT: ret{{[l|q]}} 2493 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12> 2494 ret <8 x float> %res 2495} 2496 2497define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind { 2498; CHECK-LABEL: test_mm256_sqrt_pd: 2499; CHECK: # %bb.0: # %entry 2500; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 2501; CHECK-NEXT: ret{{[l|q]}} 2502entry: 2503 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2 2504 ret <4 x double> %0 2505} 2506 2507declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1 2508 2509define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind { 2510; CHECK-LABEL: test_mm256_sqrt_ps: 2511; CHECK: # %bb.0: # %entry 2512; CHECK-NEXT: vsqrtps %ymm0, %ymm0 2513; CHECK-NEXT: ret{{[l|q]}} 2514entry: 2515 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2 2516 ret <8 x float> %0 2517} 2518 2519declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1 2520 2521define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind { 2522; X86-LABEL: test_mm256_store_pd: 2523; X86: # %bb.0: 2524; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2525; X86-NEXT: vmovaps %ymm0, (%eax) 2526; X86-NEXT: vzeroupper 2527; X86-NEXT: retl 2528; 2529; X64-LABEL: test_mm256_store_pd: 2530; X64: # %bb.0: 2531; X64-NEXT: vmovaps %ymm0, (%rdi) 2532; X64-NEXT: vzeroupper 2533; X64-NEXT: retq 2534 %arg0 = bitcast double* %a0 to <4 x double>* 2535 store <4 x double> %a1, <4 x double>* %arg0, align 32 2536 ret void 2537} 2538 2539define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind { 2540; X86-LABEL: test_mm256_store_ps: 2541; X86: # %bb.0: 2542; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2543; X86-NEXT: vmovaps %ymm0, (%eax) 2544; X86-NEXT: vzeroupper 2545; X86-NEXT: retl 2546; 2547; X64-LABEL: test_mm256_store_ps: 2548; X64: # %bb.0: 2549; X64-NEXT: vmovaps %ymm0, (%rdi) 2550; X64-NEXT: vzeroupper 2551; X64-NEXT: retq 2552 %arg0 = bitcast float* %a0 to <8 x float>* 2553 store <8 x float> %a1, <8 x float>* %arg0, align 32 2554 ret void 2555} 2556 2557define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind { 2558; X86-LABEL: test_mm256_store_si256: 2559; X86: # %bb.0: 2560; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2561; X86-NEXT: vmovaps %ymm0, (%eax) 2562; X86-NEXT: vzeroupper 2563; X86-NEXT: retl 2564; 2565; X64-LABEL: test_mm256_store_si256: 2566; X64: # %bb.0: 2567; X64-NEXT: vmovaps %ymm0, (%rdi) 2568; X64-NEXT: vzeroupper 2569; X64-NEXT: retq 2570 store <4 x i64> %a1, <4 x i64>* %a0, align 32 2571 ret void 2572} 2573 2574define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind { 2575; X86-LABEL: test_mm256_storeu_pd: 2576; X86: # %bb.0: 2577; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2578; X86-NEXT: vmovups %ymm0, (%eax) 2579; X86-NEXT: vzeroupper 2580; X86-NEXT: retl 2581; 2582; X64-LABEL: test_mm256_storeu_pd: 2583; X64: # %bb.0: 2584; X64-NEXT: vmovups %ymm0, (%rdi) 2585; X64-NEXT: vzeroupper 2586; X64-NEXT: retq 2587 %arg0 = bitcast double* %a0 to <4 x double>* 2588 store <4 x double> %a1, <4 x double>* %arg0, align 1 2589 ret void 2590} 2591 2592define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind { 2593; X86-LABEL: test_mm256_storeu_ps: 2594; X86: # %bb.0: 2595; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2596; X86-NEXT: vmovups %ymm0, (%eax) 2597; X86-NEXT: vzeroupper 2598; X86-NEXT: retl 2599; 2600; X64-LABEL: test_mm256_storeu_ps: 2601; X64: # %bb.0: 2602; X64-NEXT: vmovups %ymm0, (%rdi) 2603; X64-NEXT: vzeroupper 2604; X64-NEXT: retq 2605 %arg0 = bitcast float* %a0 to <8 x float>* 2606 store <8 x float> %a1, <8 x float>* %arg0, align 1 2607 ret void 2608} 2609 2610define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind { 2611; X86-LABEL: test_mm256_storeu_si256: 2612; X86: # %bb.0: 2613; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2614; X86-NEXT: vmovups %ymm0, (%eax) 2615; X86-NEXT: vzeroupper 2616; X86-NEXT: retl 2617; 2618; X64-LABEL: test_mm256_storeu_si256: 2619; X64: # %bb.0: 2620; X64-NEXT: vmovups %ymm0, (%rdi) 2621; X64-NEXT: vzeroupper 2622; X64-NEXT: retq 2623 store <4 x i64> %a1, <4 x i64>* %a0, align 1 2624 ret void 2625} 2626 2627define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind { 2628; X86-LABEL: test_mm256_storeu2_m128: 2629; X86: # %bb.0: 2630; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2631; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 2632; X86-NEXT: vmovups %xmm0, (%ecx) 2633; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 2634; X86-NEXT: vmovups %xmm0, (%eax) 2635; X86-NEXT: vzeroupper 2636; X86-NEXT: retl 2637; 2638; X64-LABEL: test_mm256_storeu2_m128: 2639; X64: # %bb.0: 2640; X64-NEXT: vmovups %xmm0, (%rdi) 2641; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 2642; X64-NEXT: vmovups %xmm0, (%rsi) 2643; X64-NEXT: vzeroupper 2644; X64-NEXT: retq 2645 %arg0 = bitcast float* %a0 to <4 x float>* 2646 %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2647 store <4 x float> %lo, <4 x float>* %arg0, align 1 2648 %arg1 = bitcast float* %a1 to <4 x float>* 2649 %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 2650 store <4 x float> %hi, <4 x float>* %arg1, align 1 2651 ret void 2652} 2653 2654define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind { 2655; X86-LABEL: test_mm256_storeu2_m128d: 2656; X86: # %bb.0: 2657; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2658; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 2659; X86-NEXT: vmovups %xmm0, (%ecx) 2660; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 2661; X86-NEXT: vmovups %xmm0, (%eax) 2662; X86-NEXT: vzeroupper 2663; X86-NEXT: retl 2664; 2665; X64-LABEL: test_mm256_storeu2_m128d: 2666; X64: # %bb.0: 2667; X64-NEXT: vmovups %xmm0, (%rdi) 2668; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 2669; X64-NEXT: vmovups %xmm0, (%rsi) 2670; X64-NEXT: vzeroupper 2671; X64-NEXT: retq 2672 %arg0 = bitcast double* %a0 to <2 x double>* 2673 %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1> 2674 store <2 x double> %lo, <2 x double>* %arg0, align 1 2675 %arg1 = bitcast double* %a1 to <2 x double>* 2676 %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3> 2677 store <2 x double> %hi, <2 x double>* %arg1, align 1 2678 ret void 2679} 2680 2681define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind { 2682; X86-LABEL: test_mm256_storeu2_m128i: 2683; X86: # %bb.0: 2684; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2685; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 2686; X86-NEXT: vmovups %xmm0, (%ecx) 2687; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 2688; X86-NEXT: vmovups %xmm0, (%eax) 2689; X86-NEXT: vzeroupper 2690; X86-NEXT: retl 2691; 2692; X64-LABEL: test_mm256_storeu2_m128i: 2693; X64: # %bb.0: 2694; X64-NEXT: vmovups %xmm0, (%rdi) 2695; X64-NEXT: vextractf128 $1, %ymm0, %xmm0 2696; X64-NEXT: vmovups %xmm0, (%rsi) 2697; X64-NEXT: vzeroupper 2698; X64-NEXT: retq 2699 %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>* 2700 %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1> 2701 store <2 x i64> %lo, <2 x i64>* %arg0, align 1 2702 %arg1 = bitcast <2 x i64>* %a1 to <2 x i64>* 2703 %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3> 2704 store <2 x i64> %hi, <2 x i64>* %arg1, align 1 2705 ret void 2706} 2707 2708define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind { 2709; X86-LABEL: test_mm256_stream_pd: 2710; X86: # %bb.0: 2711; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2712; X86-NEXT: vmovntps %ymm0, (%eax) 2713; X86-NEXT: vzeroupper 2714; X86-NEXT: retl 2715; 2716; X64-LABEL: test_mm256_stream_pd: 2717; X64: # %bb.0: 2718; X64-NEXT: vmovntps %ymm0, (%rdi) 2719; X64-NEXT: vzeroupper 2720; X64-NEXT: retq 2721 %arg0 = bitcast double* %a0 to <4 x double>* 2722 store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0 2723 ret void 2724} 2725 2726define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind { 2727; X86-LABEL: test_mm256_stream_ps: 2728; X86: # %bb.0: 2729; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2730; X86-NEXT: vmovntps %ymm0, (%eax) 2731; X86-NEXT: vzeroupper 2732; X86-NEXT: retl 2733; 2734; X64-LABEL: test_mm256_stream_ps: 2735; X64: # %bb.0: 2736; X64-NEXT: vmovntps %ymm0, (%rdi) 2737; X64-NEXT: vzeroupper 2738; X64-NEXT: retq 2739 %arg0 = bitcast float* %a0 to <8 x float>* 2740 store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0 2741 ret void 2742} 2743 2744define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind { 2745; X86-LABEL: test_mm256_stream_si256: 2746; X86: # %bb.0: 2747; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 2748; X86-NEXT: vmovntps %ymm0, (%eax) 2749; X86-NEXT: vzeroupper 2750; X86-NEXT: retl 2751; 2752; X64-LABEL: test_mm256_stream_si256: 2753; X64: # %bb.0: 2754; X64-NEXT: vmovntps %ymm0, (%rdi) 2755; X64-NEXT: vzeroupper 2756; X64-NEXT: retq 2757 store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0 2758 ret void 2759} 2760 2761define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2762; CHECK-LABEL: test_mm256_sub_pd: 2763; CHECK: # %bb.0: 2764; CHECK-NEXT: vsubpd %ymm1, %ymm0, %ymm0 2765; CHECK-NEXT: ret{{[l|q]}} 2766 %res = fsub <4 x double> %a0, %a1 2767 ret <4 x double> %res 2768} 2769 2770define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2771; CHECK-LABEL: test_mm256_sub_ps: 2772; CHECK: # %bb.0: 2773; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 2774; CHECK-NEXT: ret{{[l|q]}} 2775 %res = fsub <8 x float> %a0, %a1 2776 ret <8 x float> %res 2777} 2778 2779define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind { 2780; CHECK-LABEL: test_mm_testc_pd: 2781; CHECK: # %bb.0: 2782; CHECK-NEXT: xorl %eax, %eax 2783; CHECK-NEXT: vtestpd %xmm1, %xmm0 2784; CHECK-NEXT: setb %al 2785; CHECK-NEXT: ret{{[l|q]}} 2786 %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) 2787 ret i32 %res 2788} 2789declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone 2790 2791define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2792; CHECK-LABEL: test_mm256_testc_pd: 2793; CHECK: # %bb.0: 2794; CHECK-NEXT: xorl %eax, %eax 2795; CHECK-NEXT: vtestpd %ymm1, %ymm0 2796; CHECK-NEXT: setb %al 2797; CHECK-NEXT: vzeroupper 2798; CHECK-NEXT: ret{{[l|q]}} 2799 %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) 2800 ret i32 %res 2801} 2802declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone 2803 2804define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 2805; CHECK-LABEL: test_mm_testc_ps: 2806; CHECK: # %bb.0: 2807; CHECK-NEXT: xorl %eax, %eax 2808; CHECK-NEXT: vtestps %xmm1, %xmm0 2809; CHECK-NEXT: setb %al 2810; CHECK-NEXT: ret{{[l|q]}} 2811 %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) 2812 ret i32 %res 2813} 2814declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone 2815 2816define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2817; CHECK-LABEL: test_mm256_testc_ps: 2818; CHECK: # %bb.0: 2819; CHECK-NEXT: xorl %eax, %eax 2820; CHECK-NEXT: vtestps %ymm1, %ymm0 2821; CHECK-NEXT: setb %al 2822; CHECK-NEXT: vzeroupper 2823; CHECK-NEXT: ret{{[l|q]}} 2824 %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) 2825 ret i32 %res 2826} 2827declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone 2828 2829define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2830; CHECK-LABEL: test_mm256_testc_si256: 2831; CHECK: # %bb.0: 2832; CHECK-NEXT: xorl %eax, %eax 2833; CHECK-NEXT: vptest %ymm1, %ymm0 2834; CHECK-NEXT: setb %al 2835; CHECK-NEXT: vzeroupper 2836; CHECK-NEXT: ret{{[l|q]}} 2837 %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) 2838 ret i32 %res 2839} 2840declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone 2841 2842define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind { 2843; CHECK-LABEL: test_mm_testnzc_pd: 2844; CHECK: # %bb.0: 2845; CHECK-NEXT: xorl %eax, %eax 2846; CHECK-NEXT: vtestpd %xmm1, %xmm0 2847; CHECK-NEXT: seta %al 2848; CHECK-NEXT: ret{{[l|q]}} 2849 %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1) 2850 ret i32 %res 2851} 2852declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone 2853 2854define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2855; CHECK-LABEL: test_mm256_testnzc_pd: 2856; CHECK: # %bb.0: 2857; CHECK-NEXT: xorl %eax, %eax 2858; CHECK-NEXT: vtestpd %ymm1, %ymm0 2859; CHECK-NEXT: seta %al 2860; CHECK-NEXT: vzeroupper 2861; CHECK-NEXT: ret{{[l|q]}} 2862 %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) 2863 ret i32 %res 2864} 2865declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone 2866 2867define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 2868; CHECK-LABEL: test_mm_testnzc_ps: 2869; CHECK: # %bb.0: 2870; CHECK-NEXT: xorl %eax, %eax 2871; CHECK-NEXT: vtestps %xmm1, %xmm0 2872; CHECK-NEXT: seta %al 2873; CHECK-NEXT: ret{{[l|q]}} 2874 %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1) 2875 ret i32 %res 2876} 2877declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone 2878 2879define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2880; CHECK-LABEL: test_mm256_testnzc_ps: 2881; CHECK: # %bb.0: 2882; CHECK-NEXT: xorl %eax, %eax 2883; CHECK-NEXT: vtestps %ymm1, %ymm0 2884; CHECK-NEXT: seta %al 2885; CHECK-NEXT: vzeroupper 2886; CHECK-NEXT: ret{{[l|q]}} 2887 %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) 2888 ret i32 %res 2889} 2890declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone 2891 2892define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2893; CHECK-LABEL: test_mm256_testnzc_si256: 2894; CHECK: # %bb.0: 2895; CHECK-NEXT: xorl %eax, %eax 2896; CHECK-NEXT: vptest %ymm1, %ymm0 2897; CHECK-NEXT: seta %al 2898; CHECK-NEXT: vzeroupper 2899; CHECK-NEXT: ret{{[l|q]}} 2900 %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) 2901 ret i32 %res 2902} 2903declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone 2904 2905define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind { 2906; CHECK-LABEL: test_mm_testz_pd: 2907; CHECK: # %bb.0: 2908; CHECK-NEXT: xorl %eax, %eax 2909; CHECK-NEXT: vtestpd %xmm1, %xmm0 2910; CHECK-NEXT: sete %al 2911; CHECK-NEXT: ret{{[l|q]}} 2912 %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1) 2913 ret i32 %res 2914} 2915declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone 2916 2917define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2918; CHECK-LABEL: test_mm256_testz_pd: 2919; CHECK: # %bb.0: 2920; CHECK-NEXT: xorl %eax, %eax 2921; CHECK-NEXT: vtestpd %ymm1, %ymm0 2922; CHECK-NEXT: sete %al 2923; CHECK-NEXT: vzeroupper 2924; CHECK-NEXT: ret{{[l|q]}} 2925 %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) 2926 ret i32 %res 2927} 2928declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone 2929 2930define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 2931; CHECK-LABEL: test_mm_testz_ps: 2932; CHECK: # %bb.0: 2933; CHECK-NEXT: xorl %eax, %eax 2934; CHECK-NEXT: vtestps %xmm1, %xmm0 2935; CHECK-NEXT: sete %al 2936; CHECK-NEXT: ret{{[l|q]}} 2937 %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1) 2938 ret i32 %res 2939} 2940declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone 2941 2942define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 2943; CHECK-LABEL: test_mm256_testz_ps: 2944; CHECK: # %bb.0: 2945; CHECK-NEXT: xorl %eax, %eax 2946; CHECK-NEXT: vtestps %ymm1, %ymm0 2947; CHECK-NEXT: sete %al 2948; CHECK-NEXT: vzeroupper 2949; CHECK-NEXT: ret{{[l|q]}} 2950 %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) 2951 ret i32 %res 2952} 2953declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone 2954 2955define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2956; CHECK-LABEL: test_mm256_testz_si256: 2957; CHECK: # %bb.0: 2958; CHECK-NEXT: xorl %eax, %eax 2959; CHECK-NEXT: vptest %ymm1, %ymm0 2960; CHECK-NEXT: sete %al 2961; CHECK-NEXT: vzeroupper 2962; CHECK-NEXT: ret{{[l|q]}} 2963 %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) 2964 ret i32 %res 2965} 2966declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone 2967 2968define <2 x double> @test_mm_undefined_pd() nounwind { 2969; CHECK-LABEL: test_mm_undefined_pd: 2970; CHECK: # %bb.0: 2971; CHECK-NEXT: ret{{[l|q]}} 2972 ret <2 x double> undef 2973} 2974 2975define <4 x double> @test_mm256_undefined_pd() nounwind { 2976; CHECK-LABEL: test_mm256_undefined_pd: 2977; CHECK: # %bb.0: 2978; CHECK-NEXT: ret{{[l|q]}} 2979 ret <4 x double> undef 2980} 2981 2982define <8 x float> @test_mm256_undefined_ps() nounwind { 2983; CHECK-LABEL: test_mm256_undefined_ps: 2984; CHECK: # %bb.0: 2985; CHECK-NEXT: ret{{[l|q]}} 2986 ret <8 x float> undef 2987} 2988 2989define <4 x i64> @test_mm256_undefined_si256() nounwind { 2990; CHECK-LABEL: test_mm256_undefined_si256: 2991; CHECK: # %bb.0: 2992; CHECK-NEXT: ret{{[l|q]}} 2993 ret <4 x i64> undef 2994} 2995 2996define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 2997; CHECK-LABEL: test_mm256_unpackhi_pd: 2998; CHECK: # %bb.0: 2999; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] 3000; CHECK-NEXT: ret{{[l|q]}} 3001 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> 3002 ret <4 x double> %res 3003} 3004 3005define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 3006; CHECK-LABEL: test_mm256_unpackhi_ps: 3007; CHECK: # %bb.0: 3008; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] 3009; CHECK-NEXT: ret{{[l|q]}} 3010 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> 3011 ret <8 x float> %res 3012} 3013 3014define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 3015; CHECK-LABEL: test_mm256_unpacklo_pd: 3016; CHECK: # %bb.0: 3017; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] 3018; CHECK-NEXT: ret{{[l|q]}} 3019 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6> 3020 ret <4 x double> %res 3021} 3022 3023define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 3024; CHECK-LABEL: test_mm256_unpacklo_ps: 3025; CHECK: # %bb.0: 3026; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] 3027; CHECK-NEXT: ret{{[l|q]}} 3028 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> 3029 ret <8 x float> %res 3030} 3031 3032define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind { 3033; CHECK-LABEL: test_mm256_xor_pd: 3034; CHECK: # %bb.0: 3035; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 3036; CHECK-NEXT: ret{{[l|q]}} 3037 %1 = bitcast <4 x double> %a0 to <4 x i64> 3038 %2 = bitcast <4 x double> %a1 to <4 x i64> 3039 %res = xor <4 x i64> %1, %2 3040 %bc = bitcast <4 x i64> %res to <4 x double> 3041 ret <4 x double> %bc 3042} 3043 3044define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind { 3045; CHECK-LABEL: test_mm256_xor_ps: 3046; CHECK: # %bb.0: 3047; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 3048; CHECK-NEXT: ret{{[l|q]}} 3049 %1 = bitcast <8 x float> %a0 to <8 x i32> 3050 %2 = bitcast <8 x float> %a1 to <8 x i32> 3051 %res = xor <8 x i32> %1, %2 3052 %bc = bitcast <8 x i32> %res to <8 x float> 3053 ret <8 x float> %bc 3054} 3055 3056define void @test_mm256_zeroall() nounwind { 3057; CHECK-LABEL: test_mm256_zeroall: 3058; CHECK: # %bb.0: 3059; CHECK-NEXT: vzeroall 3060; CHECK-NEXT: ret{{[l|q]}} 3061 call void @llvm.x86.avx.vzeroall() 3062 ret void 3063} 3064declare void @llvm.x86.avx.vzeroall() nounwind readnone 3065 3066define void @test_mm256_zeroupper() nounwind { 3067; CHECK-LABEL: test_mm256_zeroupper: 3068; CHECK: # %bb.0: 3069; CHECK-NEXT: vzeroupper 3070; CHECK-NEXT: ret{{[l|q]}} 3071 call void @llvm.x86.avx.vzeroupper() 3072 ret void 3073} 3074declare void @llvm.x86.avx.vzeroupper() nounwind readnone 3075 3076define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind { 3077; CHECK-LABEL: test_mm256_zextpd128_pd256: 3078; CHECK: # %bb.0: 3079; CHECK-NEXT: vmovaps %xmm0, %xmm0 3080; CHECK-NEXT: ret{{[l|q]}} 3081 %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3082 ret <4 x double> %res 3083} 3084 3085define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind { 3086; CHECK-LABEL: test_mm256_zextps128_ps256: 3087; CHECK: # %bb.0: 3088; CHECK-NEXT: vmovaps %xmm0, %xmm0 3089; CHECK-NEXT: ret{{[l|q]}} 3090 %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3091 ret <8 x float> %res 3092} 3093 3094define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind { 3095; CHECK-LABEL: test_mm256_zextsi128_si256: 3096; CHECK: # %bb.0: 3097; CHECK-NEXT: vmovaps %xmm0, %xmm0 3098; CHECK-NEXT: ret{{[l|q]}} 3099 %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3100 ret <4 x i64> %res 3101} 3102 3103!0 = !{i32 1} 3104