; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse41-builtins.c

define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_blend_epi16:
; SSE: # %bb.0:
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_epi16:
; AVX: # %bb.0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_blend_pd:
; SSE: # %bb.0:
; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_pd:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %res
}

define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_blend_ps:
; SSE: # %bb.0:
; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_ps:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x float> %res
}

define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; SSE-LABEL: test_mm_blendv_epi8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_epi8:
; AVX: # %bb.0:
; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %call = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
  %res = bitcast <16 x i8> %call to <2 x i64>
  ret <2 x i64> %res
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
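; NOTE: The blendv instructions select from the second source wherever the
; sign bit of the corresponding mask element is set. The non-VEX forms take
; the mask implicitly in xmm0, which is why the SSE checks in these blendv
; tests shuffle the mask into xmm0 before issuing (p)blendv*.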

define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: test_mm_blendv_pd:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE-NEXT: movapd %xmm3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_pd:
; AVX: # %bb.0:
; AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: test_mm_blendv_ps:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_ps:
; AVX: # %bb.0:
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_ceil_pd:
; SSE: # %bb.0:
; SSE-NEXT: roundpd $2, %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_pd:
; AVX: # %bb.0:
; AVX-NEXT: vroundpd $2, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_ceil_ps:
; SSE: # %bb.0:
; SSE-NEXT: roundps $2, %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ps:
; AVX: # %bb.0:
; AVX-NEXT: vroundps $2, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_ceil_sd:
; SSE: # %bb.0:
; SSE-NEXT: roundsd $2, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_sd:
; AVX: # %bb.0:
; AVX-NEXT: vroundsd $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_ceil_ss:
; SSE: # %bb.0:
; SSE-NEXT: roundss $2, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ss:
; AVX: # %bb.0:
; AVX-NEXT: vroundss $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
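; NOTE: ROUND* immediate: bits 1:0 select the rounding mode (00 = nearest,
; 01 = down, 10 = up, 11 = truncate) and bit 2, when set, uses the current
; MXCSR rounding mode instead. Hence $2 for the ceil tests above, $1 for the
; floor tests and $4 for the _mm_round_* tests further down.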

define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_cmpeq_epi64:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_cmpeq_epi64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: ret{{[l|q]}}
;
; AVX512-LABEL: test_mm_cmpeq_epi64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: ret{{[l|q]}}
  %cmp = icmp eq <2 x i64> %a0, %a1
  %res = sext <2 x i1> %cmp to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi16:
; SSE: # %bb.0:
; SSE-NEXT: pmovsxbw %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi16:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = sext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi32:
; SSE: # %bb.0:
; SSE-NEXT: pmovsxbd %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi64:
; SSE: # %bb.0:
; SSE-NEXT: pmovsxbq %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi64:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi32:
; SSE: # %bb.0:
; SSE-NEXT: pmovsxwd %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi64:
; SSE: # %bb.0:
; SSE-NEXT: pmovsxwq %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi64:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi32_epi64:
; SSE: # %bb.0:
; SSE-NEXT: pmovsxdq %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi32_epi64:
; AVX: # %bb.0:
; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
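; NOTE: The cvtepu* tests below mirror the cvtepi* tests above: a
; shufflevector extracts the low subvector and zero-extending (rather than
; sign-extending) it selects the pmovzx* forms instead of pmovsx*.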

define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi16:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi16:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = zext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi32:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi64:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi64:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi32:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi64:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi64:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu32_epi64:
; SSE: # %bb.0:
; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu32_epi64:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
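; NOTE: For DPPS/DPPD the high nibble of the immediate selects which element
; products participate in the dot product and the low nibble selects which
; result elements receive the sum; the remaining elements are zeroed.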

define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_dp_pd:
; SSE: # %bb.0:
; SSE-NEXT: dppd $7, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_pd:
; AVX: # %bb.0:
; AVX-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_dp_ps:
; SSE: # %bb.0:
; SSE-NEXT: dpps $7, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_ps:
; AVX: # %bb.0:
; AVX-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone

define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi8:
; SSE: # %bb.0:
; SSE-NEXT: pextrb $1, %xmm0, %eax
; SSE-NEXT: movzbl %al, %eax
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi8:
; AVX: # %bb.0:
; AVX-NEXT: vpextrb $1, %xmm0, %eax
; AVX-NEXT: movzbl %al, %eax
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = extractelement <16 x i8> %arg0, i32 1
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi32:
; SSE: # %bb.0:
; SSE-NEXT: extractps $1, %xmm0, %eax
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = extractelement <4 x i32> %arg0, i32 1
  ret i32 %ext
}
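; NOTE: pextrq is only available in 64-bit mode, so on X86 the i64 extract
; below is lowered to two 32-bit extractps into the eax:edx pair.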

define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
; X86-SSE-LABEL: test_mm_extract_epi64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: extractps $2, %xmm0, %eax
; X86-SSE-NEXT: extractps $3, %xmm0, %edx
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: test_mm_extract_epi64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vextractps $2, %xmm0, %eax
; X86-AVX-NEXT: vextractps $3, %xmm0, %edx
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_mm_extract_epi64:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pextrq $1, %xmm0, %rax
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_mm_extract_epi64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = extractelement <2 x i64> %a0, i32 1
  ret i64 %ext
}

define i32 @test_mm_extract_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_extract_ps:
; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_ps:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: ret{{[l|q]}}
  %ext = extractelement <4 x float> %a0, i32 1
  %bc = bitcast float %ext to i32
  ret i32 %bc
}

define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_floor_pd:
; SSE: # %bb.0:
; SSE-NEXT: roundpd $1, %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_pd:
; AVX: # %bb.0:
; AVX-NEXT: vroundpd $1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
  ret <2 x double> %res
}

define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_floor_ps:
; SSE: # %bb.0:
; SSE-NEXT: roundps $1, %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ps:
; AVX: # %bb.0:
; AVX-NEXT: vroundps $1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
  ret <4 x float> %res
}

define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_floor_sd:
; SSE: # %bb.0:
; SSE-NEXT: roundsd $1, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_sd:
; AVX: # %bb.0:
; AVX-NEXT: vroundsd $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
  ret <2 x double> %res
}

define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_floor_ss:
; SSE: # %bb.0:
; SSE-NEXT: roundss $1, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ss:
; AVX: # %bb.0:
; AVX-NEXT: vroundss $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
  ret <4 x float> %res
}

define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: pinsrb $1, %eax, %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: test_mm_insert_epi8:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_mm_insert_epi8:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movzbl %dil, %eax
; X64-SSE-NEXT: pinsrb $1, %eax, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_mm_insert_epi8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movzbl %dil, %eax
; X64-AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res = insertelement <16 x i8> %arg0, i8 %a1, i32 1
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi32:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: test_mm_insert_epi32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_mm_insert_epi32:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pinsrd $1, %edi, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_mm_insert_epi32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = insertelement <4 x i32> %arg0, i32 %a1, i32 1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
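; NOTE: Likewise, pinsrq is 64-bit only, so on X86 the i64 insert below is
; assembled from two pinsrd of the value's halves loaded from the stack.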

define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: test_mm_insert_epi64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_mm_insert_epi64:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pinsrq $1, %rdi, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_mm_insert_epi64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; X64-AVX-NEXT: retq
  %res = insertelement <2 x i64> %a0, i64 %a1, i32 1
  ret <2 x i64> %res
}

define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_insert_ps:
; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_insert_ps:
; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
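; NOTE: The min/max tests below use the canonical icmp+select idiom, which
; the backend matches to the corresponding SSE4.1 pmax*/pmin* instruction
; for each signedness and element width.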

define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi8:
; SSE: # %bb.0:
; SSE-NEXT: pmaxsb %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi8:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp sgt <16 x i8> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi32:
; SSE: # %bb.0:
; SSE-NEXT: pmaxsd %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp sgt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu16:
; SSE: # %bb.0:
; SSE-NEXT: pmaxuw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu16:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp ugt <8 x i16> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
  %bc = bitcast <8 x i16> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu32:
; SSE: # %bb.0:
; SSE-NEXT: pmaxud %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu32:
; AVX: # %bb.0:
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp ugt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi8:
; SSE: # %bb.0:
; SSE-NEXT: pminsb %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi8:
; AVX: # %bb.0:
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp slt <16 x i8> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi32:
; SSE: # %bb.0:
; SSE-NEXT: pminsd %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp slt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epu16:
; SSE: # %bb.0:
; SSE-NEXT: pminuw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epu16:
; AVX: # %bb.0:
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp ult <8 x i16> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
  %bc = bitcast <8 x i16> %sel to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epu32:
; SSE: # %bb.0:
; SSE-NEXT: pminud %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epu32:
; AVX: # %bb.0:
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp ult <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
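; NOTE: phminposuw returns the minimum unsigned word in element 0 and its
; index in the low bits of element 1, zeroing the remaining elements.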

define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_minpos_epu16:
; SSE: # %bb.0:
; SSE-NEXT: phminposuw %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_minpos_epu16:
; AVX: # %bb.0:
; AVX-NEXT: vphminposuw %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %arg0)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone

define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mpsadbw_epu8:
; SSE: # %bb.0:
; SSE-NEXT: mpsadbw $1, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_mpsadbw_epu8:
; AVX: # %bb.0:
; AVX-NEXT: vmpsadbw $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %arg0, <16 x i8> %arg1, i8 1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mul_epi32:
; SSE: # %bb.0:
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE-NEXT: pmuldq %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_mul_epi32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: ret{{[l|q]}}
;
; AVX512-LABEL: test_mm_mul_epi32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
; AVX512-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: ret{{[l|q]}}
  %A = shl <2 x i64> %a0, <i64 32, i64 32>
  %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
  %B = shl <2 x i64> %a1, <i64 32, i64 32>
  %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
  %res = mul nsw <2 x i64> %A1, %B1
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mullo_epi32:
; SSE: # %bb.0:
; SSE-NEXT: pmulld %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_mullo_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = mul <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
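; NOTE: packusdw narrows each signed i32 input to i16 with unsigned
; saturation, packing the first source into the low half of the result and
; the second source into the high half.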

define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_packus_epi32:
; SSE: # %bb.0:
; SSE-NEXT: packusdw %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_packus_epi32:
; AVX: # %bb.0:
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone

define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_round_pd:
; SSE: # %bb.0:
; SSE-NEXT: roundpd $4, %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_pd:
; AVX: # %bb.0:
; AVX-NEXT: vroundpd $4, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
  ret <2 x double> %res
}

define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_round_ps:
; SSE: # %bb.0:
; SSE-NEXT: roundps $4, %xmm0, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_ps:
; AVX: # %bb.0:
; AVX-NEXT: vroundps $4, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
  ret <4 x float> %res
}

define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_round_sd:
; SSE: # %bb.0:
; SSE-NEXT: roundsd $4, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_sd:
; AVX: # %bb.0:
; AVX-NEXT: vroundsd $4, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
  ret <2 x double> %res
}

define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_round_ss:
; SSE: # %bb.0:
; SSE-NEXT: roundss $4, %xmm1, %xmm0
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_ss:
; AVX: # %bb.0:
; AVX-NEXT: vroundss $4, %xmm1, %xmm0, %xmm0
; AVX-NEXT: ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
  ret <4 x float> %res
}

define <2 x i64> @test_mm_stream_load_si128(<2 x i64>* %a0) {
; X86-SSE-LABEL: test_mm_stream_load_si128:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movntdqa (%eax), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: test_mm_stream_load_si128:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vmovntdqa (%eax), %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_mm_stream_load_si128:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movntdqa (%rdi), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_mm_stream_load_si128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT: retq
  %arg0 = bitcast <2 x i64>* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
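; NOTE: PTEST sets ZF if (a & mask) == 0 and CF if (~a & mask) == 0, so the
; tests below check sete for ptestz, setb for ptestc and seta (ZF == CF == 0)
; for ptestnzc. test_all_ones materializes an all-ones mask with pcmpeqd so
; that CF is set only when %a0 is all ones.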

define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
; SSE-LABEL: test_mm_test_all_ones:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm1, %xmm0
; SSE-NEXT: setb %al
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_all_ones:
; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm1, %xmm0
; AVX-NEXT: setb %al
; AVX-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone

define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_test_all_zeros:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm1, %xmm0
; SSE-NEXT: sete %al
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_all_zeros:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm1, %xmm0
; AVX-NEXT: sete %al
; AVX-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone

define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_test_mix_ones_zeros:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm1, %xmm0
; SSE-NEXT: seta %al
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_mix_ones_zeros:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm1, %xmm0
; AVX-NEXT: seta %al
; AVX-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testc_si128:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm1, %xmm0
; SSE-NEXT: setb %al
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testc_si128:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm1, %xmm0
; AVX-NEXT: setb %al
; AVX-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}

define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testnzc_si128:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm1, %xmm0
; SSE-NEXT: seta %al
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testnzc_si128:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm1, %xmm0
; AVX-NEXT: seta %al
; AVX-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}

define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testz_si128:
; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: ptest %xmm1, %xmm0
; SSE-NEXT: sete %al
; SSE-NEXT: ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testz_si128:
; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm1, %xmm0
; AVX-NEXT: sete %al
; AVX-NEXT: ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}