; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
; NOTE(review): CHECK lines below are machine-generated; do not hand-edit them —
; rerun update_llc_test_checks.py after any IR change.

define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ps:
; X32:       # BB#0:
; X32-NEXT:    addps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_add_ps:
; X64:       # BB#0:
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fadd <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ss:
; X32:       # BB#0:
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_add_ss:
; X64:       # BB#0:
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fadd = fadd float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fadd, i32 0
  ret <4 x float> %res
}

; Bitwise ops on <4 x i32> bitcasts: fast-isel without SSE2 spills to the
; stack and performs the logic with scalar GPR and/not instructions.
define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_and_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_and_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    andl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    andl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = and <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_andnot_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    notl %edx
; X32-NEXT:    notl %ecx
; X32-NEXT:    notl %esi
; X32-NEXT:    notl %eax
; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_andnot_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdx
; X64-NEXT:    shrq $32, %rdx
; X64-NEXT:    movq %rax, %rsi
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    notl %eax
; X64-NEXT:    andl %edi, %eax
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    notl %ecx
; X64-NEXT:    andl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    notl %esi
; X64-NEXT:    notl %edx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %r8d, %edx
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %edi, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <4 x i32> %not, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

; Comparisons: packed (_ps) tests use fcmp+sext+bitcast IR; scalar (_ss)
; tests use the llvm.x86.sse.cmp.ss intrinsic with an immediate predicate.
; ge/gt variants swap the operands so they lower to cmple/cmplt with a
; register move (or a movss blend for the scalar forms).
define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpeqps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpeq_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpeqps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpeqss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpeq_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpeqss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpleps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpge_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpleps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ole <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpless %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpge_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpless %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpltps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpgt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpltps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp olt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpltss %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpgt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpltss %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpleps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmple_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpleps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ole <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpless %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmple_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpless %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpltps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmplt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpltps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp olt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpltss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmplt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpltss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpneqps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpneq_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpneqps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp une <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpneqss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpneq_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpneqss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnleps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnge_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnleps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ugt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnless %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnge_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnless %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnltps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpngt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnltps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uge <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnltss %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpngt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnltss %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnleps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnle_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnleps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ugt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnless %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnle_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnless %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnltps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnlt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnltps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uge <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnltss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnlt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnltss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpordps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpord_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpordps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ord <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpordss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpord_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpordss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpunordps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpunord_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpunordps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uno <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpunordss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpunord_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpunordss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
  ret <4 x float> %res
}

; COMIS tests: eq/neq additionally check PF (setnp/setp) to handle NaNs.
define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comieq_ss:
; X32:       # BB#0:
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setnp %al
; X32-NEXT:    sete %cl
; X32-NEXT:    andb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comieq_ss:
; X64:       # BB#0:
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setnp %al
; X64-NEXT:    sete %cl
; X64-NEXT:    andb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comige_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comige_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comigt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comigt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comile_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm0, %xmm1
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comile_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm0, %xmm1
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comilt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm0, %xmm1
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comilt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm0, %xmm1
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comineq_ss:
; X32:       # BB#0:
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setp %al
; X32-NEXT:    setne %cl
; X32-NEXT:    orb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comineq_ss:
; X64:       # BB#0:
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setp %al
; X64-NEXT:    setne %cl
; X64-NEXT:    orb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvt_ss2si:
; X32:       # BB#0:
; X32-NEXT:    cvtss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvt_ss2si:
; X64:       # BB#0:
; X64-NEXT:    cvtss2si %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    cvtsi2ssl %eax, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
; X64:       # BB#0:
; X64-NEXT:    cvtsi2ssl %edi, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cvt = sitofp i32 %a1 to float
  %res = insertelement <4 x float> %a0, float %cvt, i32 0
  ret <4 x float> %res
}

define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_f32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtss_f32:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = extractelement <4 x float> %a0, i32 0
  ret float %res
}

define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_si32:
; X32:       # BB#0:
; X32-NEXT:    cvtss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtss_si32:
; X64:       # BB#0:
; X64-NEXT:    cvtss2si %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}

define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si:
; X32:       # BB#0:
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvttss_si:
; X64:       # BB#0:
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    retq
  %cvt = extractelement <4 x float> %a0, i32 0
  %res = fptosi float %cvt to i32
  ret i32 %res
}

define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
; X32:       # BB#0:
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvttss_si32:
; X64:       # BB#0:
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    retq
  %cvt = extractelement <4 x float> %a0, i32 0
  %res = fptosi float %cvt to i32
  ret i32 %res
}

define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ps:
; X32:       # BB#0:
; X32-NEXT:    divps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_div_ps:
; X64:       # BB#0:
; X64-NEXT:    divps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fdiv <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ss:
; X32:       # BB#0:
; X32-NEXT:    divss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_div_ss:
; X64:       # BB#0:
; X64-NEXT:    divss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fdiv = fdiv float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
  ret <4 x float> %res
}

; MXCSR accessors: store MXCSR to a stack slot via llvm.x86.sse.stmxcsr,
; then mask out the field of interest.
define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $8064, %eax # imm = 0x1F80
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $8064, %eax # imm = 0x1F80
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 8064
  ret i32 %4
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $63, %eax
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $63, %eax
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 63
  ret i32 %4
}

define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $32768, %eax # imm = 0x8000
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $32768, %eax # imm = 0x8000
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 32768
  ret i32 %4
}

define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X32-LABEL: test_MM_GET_ROUNDING_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $24576, %eax # imm = 0x6000
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_ROUNDING_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $24576, %eax # imm = 0x6000
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 24576
  ret i32 %4
}

define i32 @test_mm_getcsr() nounwind {
; X32-LABEL: test_mm_getcsr:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_getcsr:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  ret i32 %3
}

define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movaps (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 16
  ret <4 x float> %res
}

define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps1:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ps1:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ss:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 1
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load1_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load1_ps:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadh_pi:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadh_pi:
; X64:       # BB#0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadl_pi:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadl_pi:
; X64:       # BB#0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld = load <2 x float>, <2 x float>* %ptr
  %ext
= shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 1079 %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1080 ret <4 x float> %res 1081} 1082 1083define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind { 1084; X32-LABEL: test_mm_loadr_ps: 1085; X32: # BB#0: 1086; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1087; X32-NEXT: movaps (%eax), %xmm0 1088; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] 1089; X32-NEXT: retl 1090; 1091; X64-LABEL: test_mm_loadr_ps: 1092; X64: # BB#0: 1093; X64-NEXT: movaps (%rdi), %xmm0 1094; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] 1095; X64-NEXT: retq 1096 %arg0 = bitcast float* %a0 to <4 x float>* 1097 %ld = load <4 x float>, <4 x float>* %arg0, align 16 1098 %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1099 ret <4 x float> %res 1100} 1101 1102define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind { 1103; X32-LABEL: test_mm_loadu_ps: 1104; X32: # BB#0: 1105; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1106; X32-NEXT: movups (%eax), %xmm0 1107; X32-NEXT: retl 1108; 1109; X64-LABEL: test_mm_loadu_ps: 1110; X64: # BB#0: 1111; X64-NEXT: movups (%rdi), %xmm0 1112; X64-NEXT: retq 1113 %arg0 = bitcast float* %a0 to <4 x float>* 1114 %res = load <4 x float>, <4 x float>* %arg0, align 1 1115 ret <4 x float> %res 1116} 1117 1118define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) { 1119; X32-LABEL: test_mm_max_ps: 1120; X32: # BB#0: 1121; X32-NEXT: maxps %xmm1, %xmm0 1122; X32-NEXT: retl 1123; 1124; X64-LABEL: test_mm_max_ps: 1125; X64: # BB#0: 1126; X64-NEXT: maxps %xmm1, %xmm0 1127; X64-NEXT: retq 1128 %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) 1129 ret <4 x float> %res 1130} 1131declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone 1132 1133define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) { 1134; 
X32-LABEL: test_mm_max_ss: 1135; X32: # BB#0: 1136; X32-NEXT: maxss %xmm1, %xmm0 1137; X32-NEXT: retl 1138; 1139; X64-LABEL: test_mm_max_ss: 1140; X64: # BB#0: 1141; X64-NEXT: maxss %xmm1, %xmm0 1142; X64-NEXT: retq 1143 %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) 1144 ret <4 x float> %res 1145} 1146declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone 1147 1148define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) { 1149; X32-LABEL: test_mm_min_ps: 1150; X32: # BB#0: 1151; X32-NEXT: minps %xmm1, %xmm0 1152; X32-NEXT: retl 1153; 1154; X64-LABEL: test_mm_min_ps: 1155; X64: # BB#0: 1156; X64-NEXT: minps %xmm1, %xmm0 1157; X64-NEXT: retq 1158 %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) 1159 ret <4 x float> %res 1160} 1161declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone 1162 1163define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) { 1164; X32-LABEL: test_mm_min_ss: 1165; X32: # BB#0: 1166; X32-NEXT: minss %xmm1, %xmm0 1167; X32-NEXT: retl 1168; 1169; X64-LABEL: test_mm_min_ss: 1170; X64: # BB#0: 1171; X64-NEXT: minss %xmm1, %xmm0 1172; X64-NEXT: retq 1173 %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) 1174 ret <4 x float> %res 1175} 1176declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone 1177 1178define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) { 1179; X32-LABEL: test_mm_move_ss: 1180; X32: # BB#0: 1181; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1182; X32-NEXT: retl 1183; 1184; X64-LABEL: test_mm_move_ss: 1185; X64: # BB#0: 1186; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1187; X64-NEXT: retq 1188 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1189 ret <4 x float> %res 1190} 1191 1192define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) { 
1193; X32-LABEL: test_mm_movehl_ps: 1194; X32: # BB#0: 1195; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1196; X32-NEXT: retl 1197; 1198; X64-LABEL: test_mm_movehl_ps: 1199; X64: # BB#0: 1200; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1201; X64-NEXT: retq 1202 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 1203 ret <4 x float> %res 1204} 1205 1206define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) { 1207; X32-LABEL: test_mm_movelh_ps: 1208; X32: # BB#0: 1209; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1210; X32-NEXT: retl 1211; 1212; X64-LABEL: test_mm_movelh_ps: 1213; X64: # BB#0: 1214; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1215; X64-NEXT: retq 1216 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1217 ret <4 x float> %res 1218} 1219 1220define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind { 1221; X32-LABEL: test_mm_movemask_ps: 1222; X32: # BB#0: 1223; X32-NEXT: movmskps %xmm0, %eax 1224; X32-NEXT: retl 1225; 1226; X64-LABEL: test_mm_movemask_ps: 1227; X64: # BB#0: 1228; X64-NEXT: movmskps %xmm0, %eax 1229; X64-NEXT: retq 1230 %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) 1231 ret i32 %res 1232} 1233declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone 1234 1235define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 1236; X32-LABEL: test_mm_mul_ps: 1237; X32: # BB#0: 1238; X32-NEXT: mulps %xmm1, %xmm0 1239; X32-NEXT: retl 1240; 1241; X64-LABEL: test_mm_mul_ps: 1242; X64: # BB#0: 1243; X64-NEXT: mulps %xmm1, %xmm0 1244; X64-NEXT: retq 1245 %res = fmul <4 x float> %a0, %a1 1246 ret <4 x float> %res 1247} 1248 1249define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind { 1250; X32-LABEL: test_mm_mul_ss: 1251; X32: # BB#0: 1252; X32-NEXT: mulss %xmm1, %xmm0 1253; X32-NEXT: retl 1254; 1255; X64-LABEL: test_mm_mul_ss: 1256; X64: # BB#0: 1257; X64-NEXT: 
mulss %xmm1, %xmm0 1258; X64-NEXT: retq 1259 %ext0 = extractelement <4 x float> %a0, i32 0 1260 %ext1 = extractelement <4 x float> %a1, i32 0 1261 %fmul = fmul float %ext0, %ext1 1262 %res = insertelement <4 x float> %a0, float %fmul, i32 0 1263 ret <4 x float> %res 1264} 1265 1266define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 1267; X32-LABEL: test_mm_or_ps: 1268; X32: # BB#0: 1269; X32-NEXT: pushl %ebp 1270; X32-NEXT: movl %esp, %ebp 1271; X32-NEXT: pushl %esi 1272; X32-NEXT: andl $-16, %esp 1273; X32-NEXT: subl $64, %esp 1274; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) 1275; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1276; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx 1277; X32-NEXT: movl {{[0-9]+}}(%esp), %edx 1278; X32-NEXT: movl {{[0-9]+}}(%esp), %esi 1279; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) 1280; X32-NEXT: orl {{[0-9]+}}(%esp), %esi 1281; X32-NEXT: movl %esi, (%esp) 1282; X32-NEXT: orl {{[0-9]+}}(%esp), %edx 1283; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) 1284; X32-NEXT: orl {{[0-9]+}}(%esp), %eax 1285; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) 1286; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx 1287; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) 1288; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1289; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1290; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1291; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1292; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1293; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1294; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1295; X32-NEXT: leal -4(%ebp), %esp 1296; X32-NEXT: popl %esi 1297; X32-NEXT: popl %ebp 1298; X32-NEXT: retl 1299; 1300; X64-LABEL: test_mm_or_ps: 1301; X64: # BB#0: 1302; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 1303; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax 1304; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 1305; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) 1306; X64-NEXT: movq 
-{{[0-9]+}}(%rsp), %rdx 1307; X64-NEXT: movq %rdx, %rsi 1308; X64-NEXT: orl %eax, %edx 1309; X64-NEXT: shrq $32, %rax 1310; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx 1311; X64-NEXT: movq %rcx, %rdi 1312; X64-NEXT: orl %r8d, %ecx 1313; X64-NEXT: shrq $32, %r8 1314; X64-NEXT: shrq $32, %rsi 1315; X64-NEXT: shrq $32, %rdi 1316; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) 1317; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) 1318; X64-NEXT: orl %r8d, %edi 1319; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) 1320; X64-NEXT: orl %eax, %esi 1321; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) 1322; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1323; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1324; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1325; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1326; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1327; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1328; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1329; X64-NEXT: retq 1330 %arg0 = bitcast <4 x float> %a0 to <4 x i32> 1331 %arg1 = bitcast <4 x float> %a1 to <4 x i32> 1332 %res = or <4 x i32> %arg0, %arg1 1333 %bc = bitcast <4 x i32> %res to <4 x float> 1334 ret <4 x float> %bc 1335} 1336 1337define void @test_mm_prefetch(i8* %a0) { 1338; X32-LABEL: test_mm_prefetch: 1339; X32: # BB#0: 1340; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1341; X32-NEXT: prefetchnta (%eax) 1342; X32-NEXT: retl 1343; 1344; X64-LABEL: test_mm_prefetch: 1345; X64: # BB#0: 1346; X64-NEXT: prefetchnta (%rdi) 1347; X64-NEXT: retq 1348 call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1) 1349 ret void 1350} 1351declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone 1352 1353define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) { 1354; X32-LABEL: test_mm_rcp_ps: 1355; X32: # BB#0: 1356; X32-NEXT: rcpps %xmm0, %xmm0 1357; X32-NEXT: retl 1358; 1359; X64-LABEL: test_mm_rcp_ps: 1360; X64: # BB#0: 1361; X64-NEXT: rcpps %xmm0, 
%xmm0 1362; X64-NEXT: retq 1363 %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) 1364 ret <4 x float> %res 1365} 1366declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone 1367 1368define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) { 1369; X32-LABEL: test_mm_rcp_ss: 1370; X32: # BB#0: 1371; X32-NEXT: rcpss %xmm0, %xmm0 1372; X32-NEXT: retl 1373; 1374; X64-LABEL: test_mm_rcp_ss: 1375; X64: # BB#0: 1376; X64-NEXT: rcpss %xmm0, %xmm0 1377; X64-NEXT: retq 1378 %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) 1379 %ext0 = extractelement <4 x float> %rcp, i32 0 1380 %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 1381 %ext1 = extractelement <4 x float> %a0, i32 1 1382 %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 1383 %ext2 = extractelement <4 x float> %a0, i32 2 1384 %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 1385 %ext3 = extractelement <4 x float> %a0, i32 3 1386 %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 1387 ret <4 x float> %ins3 1388} 1389declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone 1390 1391define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) { 1392; X32-LABEL: test_mm_rsqrt_ps: 1393; X32: # BB#0: 1394; X32-NEXT: rsqrtps %xmm0, %xmm0 1395; X32-NEXT: retl 1396; 1397; X64-LABEL: test_mm_rsqrt_ps: 1398; X64: # BB#0: 1399; X64-NEXT: rsqrtps %xmm0, %xmm0 1400; X64-NEXT: retq 1401 %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) 1402 ret <4 x float> %res 1403} 1404declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone 1405 1406define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) { 1407; X32-LABEL: test_mm_rsqrt_ss: 1408; X32: # BB#0: 1409; X32-NEXT: rsqrtss %xmm0, %xmm0 1410; X32-NEXT: retl 1411; 1412; X64-LABEL: test_mm_rsqrt_ss: 1413; X64: # BB#0: 1414; X64-NEXT: rsqrtss %xmm0, %xmm0 1415; X64-NEXT: retq 1416 %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) 1417 %ext0 = extractelement <4 
x float> %rsqrt, i32 0 1418 %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 1419 %ext1 = extractelement <4 x float> %a0, i32 1 1420 %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 1421 %ext2 = extractelement <4 x float> %a0, i32 2 1422 %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 1423 %ext3 = extractelement <4 x float> %a0, i32 3 1424 %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 1425 ret <4 x float> %ins3 1426} 1427declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone 1428 1429define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind { 1430; X32-LABEL: test_MM_SET_EXCEPTION_MASK: 1431; X32: # BB#0: 1432; X32-NEXT: pushl %eax 1433; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1434; X32-NEXT: leal (%esp), %ecx 1435; X32-NEXT: stmxcsr (%ecx) 1436; X32-NEXT: movl (%esp), %edx 1437; X32-NEXT: andl $-8065, %edx # imm = 0xE07F 1438; X32-NEXT: orl %eax, %edx 1439; X32-NEXT: movl %edx, (%esp) 1440; X32-NEXT: ldmxcsr (%ecx) 1441; X32-NEXT: popl %eax 1442; X32-NEXT: retl 1443; 1444; X64-LABEL: test_MM_SET_EXCEPTION_MASK: 1445; X64: # BB#0: 1446; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax 1447; X64-NEXT: stmxcsr (%rax) 1448; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx 1449; X64-NEXT: andl $-8065, %ecx # imm = 0xE07F 1450; X64-NEXT: orl %edi, %ecx 1451; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) 1452; X64-NEXT: ldmxcsr (%rax) 1453; X64-NEXT: retq 1454 %1 = alloca i32, align 4 1455 %2 = bitcast i32* %1 to i8* 1456 call void @llvm.x86.sse.stmxcsr(i8* %2) 1457 %3 = load i32, i32* %1 1458 %4 = and i32 %3, -8065 1459 %5 = or i32 %4, %a0 1460 store i32 %5, i32* %1 1461 call void @llvm.x86.sse.ldmxcsr(i8* %2) 1462 ret void 1463} 1464declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone 1465 1466define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind { 1467; X32-LABEL: test_MM_SET_EXCEPTION_STATE: 1468; X32: # BB#0: 1469; X32-NEXT: pushl %eax 1470; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1471; X32-NEXT: leal (%esp), %ecx 1472; 
X32-NEXT: stmxcsr (%ecx) 1473; X32-NEXT: movl (%esp), %edx 1474; X32-NEXT: andl $-64, %edx 1475; X32-NEXT: orl %eax, %edx 1476; X32-NEXT: movl %edx, (%esp) 1477; X32-NEXT: ldmxcsr (%ecx) 1478; X32-NEXT: popl %eax 1479; X32-NEXT: retl 1480; 1481; X64-LABEL: test_MM_SET_EXCEPTION_STATE: 1482; X64: # BB#0: 1483; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax 1484; X64-NEXT: stmxcsr (%rax) 1485; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx 1486; X64-NEXT: andl $-64, %ecx 1487; X64-NEXT: orl %edi, %ecx 1488; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) 1489; X64-NEXT: ldmxcsr (%rax) 1490; X64-NEXT: retq 1491 %1 = alloca i32, align 4 1492 %2 = bitcast i32* %1 to i8* 1493 call void @llvm.x86.sse.stmxcsr(i8* %2) 1494 %3 = load i32, i32* %1 1495 %4 = and i32 %3, -64 1496 %5 = or i32 %4, %a0 1497 store i32 %5, i32* %1 1498 call void @llvm.x86.sse.ldmxcsr(i8* %2) 1499 ret void 1500} 1501 1502define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind { 1503; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE: 1504; X32: # BB#0: 1505; X32-NEXT: pushl %eax 1506; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1507; X32-NEXT: leal (%esp), %ecx 1508; X32-NEXT: stmxcsr (%ecx) 1509; X32-NEXT: movl (%esp), %edx 1510; X32-NEXT: andl $-32769, %edx # imm = 0xFFFF7FFF 1511; X32-NEXT: orl %eax, %edx 1512; X32-NEXT: movl %edx, (%esp) 1513; X32-NEXT: ldmxcsr (%ecx) 1514; X32-NEXT: popl %eax 1515; X32-NEXT: retl 1516; 1517; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE: 1518; X64: # BB#0: 1519; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax 1520; X64-NEXT: stmxcsr (%rax) 1521; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx 1522; X64-NEXT: andl $-32769, %ecx # imm = 0xFFFF7FFF 1523; X64-NEXT: orl %edi, %ecx 1524; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) 1525; X64-NEXT: ldmxcsr (%rax) 1526; X64-NEXT: retq 1527 %1 = alloca i32, align 4 1528 %2 = bitcast i32* %1 to i8* 1529 call void @llvm.x86.sse.stmxcsr(i8* %2) 1530 %3 = load i32, i32* %1 1531 %4 = and i32 %3, -32769 1532 %5 = or i32 %4, %a0 1533 store i32 %5, i32* %1 1534 call void @llvm.x86.sse.ldmxcsr(i8* 
%2) 1535 ret void 1536} 1537 1538define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind { 1539; X32-LABEL: test_mm_set_ps: 1540; X32: # BB#0: 1541; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1542; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1543; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1544; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 1545; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 1546; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1547; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1548; X32-NEXT: retl 1549; 1550; X64-LABEL: test_mm_set_ps: 1551; X64: # BB#0: 1552; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1553; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1554; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1555; X64-NEXT: movaps %xmm3, %xmm0 1556; X64-NEXT: retq 1557 %res0 = insertelement <4 x float> undef, float %a3, i32 0 1558 %res1 = insertelement <4 x float> %res0, float %a2, i32 1 1559 %res2 = insertelement <4 x float> %res1, float %a1, i32 2 1560 %res3 = insertelement <4 x float> %res2, float %a0, i32 3 1561 ret <4 x float> %res3 1562} 1563 1564define <4 x float> @test_mm_set_ps1(float %a0) nounwind { 1565; X32-LABEL: test_mm_set_ps1: 1566; X32: # BB#0: 1567; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1568; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1569; X32-NEXT: retl 1570; 1571; X64-LABEL: test_mm_set_ps1: 1572; X64: # BB#0: 1573; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1574; X64-NEXT: retq 1575 %res0 = insertelement <4 x float> undef, float %a0, i32 0 1576 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 1577 %res2 = insertelement <4 x float> %res1, float %a0, i32 2 1578 %res3 = insertelement <4 x float> %res2, float %a0, i32 3 1579 ret <4 x float> %res3 1580} 1581 1582define void @test_MM_SET_ROUNDING_MODE(i32 
%a0) nounwind { 1583; X32-LABEL: test_MM_SET_ROUNDING_MODE: 1584; X32: # BB#0: 1585; X32-NEXT: pushl %eax 1586; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1587; X32-NEXT: leal (%esp), %ecx 1588; X32-NEXT: stmxcsr (%ecx) 1589; X32-NEXT: movl (%esp), %edx 1590; X32-NEXT: andl $-24577, %edx # imm = 0x9FFF 1591; X32-NEXT: orl %eax, %edx 1592; X32-NEXT: movl %edx, (%esp) 1593; X32-NEXT: ldmxcsr (%ecx) 1594; X32-NEXT: popl %eax 1595; X32-NEXT: retl 1596; 1597; X64-LABEL: test_MM_SET_ROUNDING_MODE: 1598; X64: # BB#0: 1599; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax 1600; X64-NEXT: stmxcsr (%rax) 1601; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx 1602; X64-NEXT: andl $-24577, %ecx # imm = 0x9FFF 1603; X64-NEXT: orl %edi, %ecx 1604; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) 1605; X64-NEXT: ldmxcsr (%rax) 1606; X64-NEXT: retq 1607 %1 = alloca i32, align 4 1608 %2 = bitcast i32* %1 to i8* 1609 call void @llvm.x86.sse.stmxcsr(i8* %2) 1610 %3 = load i32, i32* %1 1611 %4 = and i32 %3, -24577 1612 %5 = or i32 %4, %a0 1613 store i32 %5, i32* %1 1614 call void @llvm.x86.sse.ldmxcsr(i8* %2) 1615 ret void 1616} 1617 1618define <4 x float> @test_mm_set_ss(float %a0) nounwind { 1619; X32-LABEL: test_mm_set_ss: 1620; X32: # BB#0: 1621; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1622; X32-NEXT: xorps %xmm0, %xmm0 1623; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1624; X32-NEXT: retl 1625; 1626; X64-LABEL: test_mm_set_ss: 1627; X64: # BB#0: 1628; X64-NEXT: xorps %xmm1, %xmm1 1629; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1630; X64-NEXT: movaps %xmm1, %xmm0 1631; X64-NEXT: retq 1632 %res0 = insertelement <4 x float> undef, float %a0, i32 0 1633 %res1 = insertelement <4 x float> %res0, float 0.0, i32 1 1634 %res2 = insertelement <4 x float> %res1, float 0.0, i32 2 1635 %res3 = insertelement <4 x float> %res2, float 0.0, i32 3 1636 ret <4 x float> %res3 1637} 1638 1639define <4 x float> @test_mm_set1_ps(float %a0) nounwind { 1640; X32-LABEL: test_mm_set1_ps: 1641; X32: # BB#0: 
1642; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1643; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1644; X32-NEXT: retl 1645; 1646; X64-LABEL: test_mm_set1_ps: 1647; X64: # BB#0: 1648; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1649; X64-NEXT: retq 1650 %res0 = insertelement <4 x float> undef, float %a0, i32 0 1651 %res1 = insertelement <4 x float> %res0, float %a0, i32 1 1652 %res2 = insertelement <4 x float> %res1, float %a0, i32 2 1653 %res3 = insertelement <4 x float> %res2, float %a0, i32 3 1654 ret <4 x float> %res3 1655} 1656 1657define void @test_mm_setcsr(i32 %a0) nounwind { 1658; X32-LABEL: test_mm_setcsr: 1659; X32: # BB#0: 1660; X32-NEXT: pushl %eax 1661; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1662; X32-NEXT: leal (%esp), %ecx 1663; X32-NEXT: movl %eax, (%esp) 1664; X32-NEXT: ldmxcsr (%ecx) 1665; X32-NEXT: popl %eax 1666; X32-NEXT: retl 1667; 1668; X64-LABEL: test_mm_setcsr: 1669; X64: # BB#0: 1670; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax 1671; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) 1672; X64-NEXT: ldmxcsr (%rax) 1673; X64-NEXT: retq 1674 %st = alloca i32, align 4 1675 store i32 %a0, i32* %st, align 4 1676 %bc = bitcast i32* %st to i8* 1677 call void @llvm.x86.sse.ldmxcsr(i8* %bc) 1678 ret void 1679} 1680 1681define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { 1682; X32-LABEL: test_mm_setr_ps: 1683; X32: # BB#0: 1684; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1685; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1686; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 1687; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1688; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 1689; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1690; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 1691; X32-NEXT: retl 1692; 1693; X64-LABEL: test_mm_setr_ps: 1694; X64: # BB#0: 1695; X64-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1] 1696; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1697; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1698; X64-NEXT: retq 1699 %res0 = insertelement <4 x float> undef, float %a0, i32 0 1700 %res1 = insertelement <4 x float> %res0, float %a1, i32 1 1701 %res2 = insertelement <4 x float> %res1, float %a2, i32 2 1702 %res3 = insertelement <4 x float> %res2, float %a3, i32 3 1703 ret <4 x float> %res3 1704} 1705 1706define <4 x float> @test_mm_setzero_ps() { 1707; X32-LABEL: test_mm_setzero_ps: 1708; X32: # BB#0: 1709; X32-NEXT: xorps %xmm0, %xmm0 1710; X32-NEXT: retl 1711; 1712; X64-LABEL: test_mm_setzero_ps: 1713; X64: # BB#0: 1714; X64-NEXT: xorps %xmm0, %xmm0 1715; X64-NEXT: retq 1716 ret <4 x float> zeroinitializer 1717} 1718 1719define void @test_mm_sfence() nounwind { 1720; X32-LABEL: test_mm_sfence: 1721; X32: # BB#0: 1722; X32-NEXT: sfence 1723; X32-NEXT: retl 1724; 1725; X64-LABEL: test_mm_sfence: 1726; X64: # BB#0: 1727; X64-NEXT: sfence 1728; X64-NEXT: retq 1729 call void @llvm.x86.sse.sfence() 1730 ret void 1731} 1732declare void @llvm.x86.sse.sfence() nounwind readnone 1733 1734define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind { 1735; X32-LABEL: test_mm_shuffle_ps: 1736; X32: # BB#0: 1737; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] 1738; X32-NEXT: retl 1739; 1740; X64-LABEL: test_mm_shuffle_ps: 1741; X64: # BB#0: 1742; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] 1743; X64-NEXT: retq 1744 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4> 1745 ret <4 x float> %res 1746} 1747 1748define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) { 1749; X32-LABEL: test_mm_sqrt_ps: 1750; X32: # BB#0: 1751; X32-NEXT: sqrtps %xmm0, %xmm0 1752; X32-NEXT: retl 1753; 1754; X64-LABEL: test_mm_sqrt_ps: 1755; X64: # BB#0: 1756; X64-NEXT: sqrtps %xmm0, %xmm0 1757; X64-NEXT: retq 1758 %res = call <4 x float> 
@llvm.x86.sse.sqrt.ps(<4 x float> %a0) 1759 ret <4 x float> %res 1760} 1761declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone 1762 1763define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) { 1764; X32-LABEL: test_mm_sqrt_ss: 1765; X32: # BB#0: 1766; X32-NEXT: sqrtss %xmm0, %xmm0 1767; X32-NEXT: retl 1768; 1769; X64-LABEL: test_mm_sqrt_ss: 1770; X64: # BB#0: 1771; X64-NEXT: sqrtss %xmm0, %xmm0 1772; X64-NEXT: retq 1773 %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) 1774 %ext0 = extractelement <4 x float> %sqrt, i32 0 1775 %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 1776 %ext1 = extractelement <4 x float> %a0, i32 1 1777 %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 1778 %ext2 = extractelement <4 x float> %a0, i32 2 1779 %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 1780 %ext3 = extractelement <4 x float> %a0, i32 3 1781 %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 1782 ret <4 x float> %ins3 1783} 1784declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone 1785 1786define void @test_mm_store_ps(float *%a0, <4 x float> %a1) { 1787; X32-LABEL: test_mm_store_ps: 1788; X32: # BB#0: 1789; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1790; X32-NEXT: movaps %xmm0, (%eax) 1791; X32-NEXT: retl 1792; 1793; X64-LABEL: test_mm_store_ps: 1794; X64: # BB#0: 1795; X64-NEXT: movaps %xmm0, (%rdi) 1796; X64-NEXT: retq 1797 %arg0 = bitcast float* %a0 to <4 x float>* 1798 store <4 x float> %a1, <4 x float>* %arg0, align 16 1799 ret void 1800} 1801 1802define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) { 1803; X32-LABEL: test_mm_store_ps1: 1804; X32: # BB#0: 1805; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1806; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1807; X32-NEXT: movaps %xmm0, (%eax) 1808; X32-NEXT: retl 1809; 1810; X64-LABEL: test_mm_store_ps1: 1811; X64: # BB#0: 1812; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1813; X64-NEXT: movaps %xmm0, (%rdi) 1814; X64-NEXT: 
retq 1815 %arg0 = bitcast float* %a0 to <4 x float>* 1816 %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer 1817 store <4 x float> %shuf, <4 x float>* %arg0, align 16 1818 ret void 1819} 1820 1821define void @test_mm_store_ss(float *%a0, <4 x float> %a1) { 1822; X32-LABEL: test_mm_store_ss: 1823; X32: # BB#0: 1824; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1825; X32-NEXT: movss %xmm0, (%eax) 1826; X32-NEXT: retl 1827; 1828; X64-LABEL: test_mm_store_ss: 1829; X64: # BB#0: 1830; X64-NEXT: movss %xmm0, (%rdi) 1831; X64-NEXT: retq 1832 %ext = extractelement <4 x float> %a1, i32 0 1833 store float %ext, float* %a0, align 1 1834 ret void 1835} 1836 1837define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) { 1838; X32-LABEL: test_mm_store1_ps: 1839; X32: # BB#0: 1840; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 1841; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1842; X32-NEXT: movaps %xmm0, (%eax) 1843; X32-NEXT: retl 1844; 1845; X64-LABEL: test_mm_store1_ps: 1846; X64: # BB#0: 1847; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] 1848; X64-NEXT: movaps %xmm0, (%rdi) 1849; X64-NEXT: retq 1850 %arg0 = bitcast float* %a0 to <4 x float>* 1851 %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer 1852 store <4 x float> %shuf, <4 x float>* %arg0, align 16 1853 ret void 1854} 1855 1856define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind { 1857; X32-LABEL: test_mm_storeh_ps: 1858; X32: # BB#0: 1859; X32-NEXT: pushl %ebp 1860; X32-NEXT: movl %esp, %ebp 1861; X32-NEXT: andl $-16, %esp 1862; X32-NEXT: subl $32, %esp 1863; X32-NEXT: movl 8(%ebp), %eax 1864; X32-NEXT: movaps %xmm0, (%esp) 1865; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx 1866; X32-NEXT: movl {{[0-9]+}}(%esp), %edx 1867; X32-NEXT: movl %edx, 4(%eax) 1868; X32-NEXT: movl %ecx, (%eax) 1869; X32-NEXT: movl %ebp, %esp 1870; X32-NEXT: popl %ebp 1871; X32-NEXT: retl 1872; 1873; X64-LABEL: test_mm_storeh_ps: 1874; X64: # BB#0: 1875; X64-NEXT: movaps 
%xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 1
  store i64 %ext, i64* %ptr
  ret void
}

; _mm_storel_ps equivalent: store the low 64 bits (float lanes 0 and 1) of %a1
; through the x86_mmx pointer %a0 (extractelement index 0 of the <2 x i64> view).
define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storel_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movaps %xmm0, (%esp)
; X32-NEXT:    movl (%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, 4(%eax)
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storel_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 0
  store i64 %ext, i64* %ptr
  ret void
}

; _mm_storer_ps equivalent: store the four lanes of %a1 in reversed order
; (3,2,1,0) to 16-byte-aligned memory; lowers to shufps + movaps.
define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storer_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storer_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

; _mm_storeu_ps equivalent: unaligned (align 1) full-vector store, selected as movups.
define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storeu_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storeu_ps:
; X64:       # BB#0:
; X64-NEXT:    movups %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 1
  ret void
}

; _mm_stream_ps equivalent: aligned store carrying !nontemporal metadata (!0),
; selected as the non-temporal movntps.
define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_stream_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movntps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_stream_ps:
; X64:       # BB#0:
; X64-NEXT:    movntps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
  ret void
}

; _mm_sub_ps equivalent: packed single-precision subtract across all four lanes.
define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ps:
; X32:       # BB#0:
; X32-NEXT:    subps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sub_ps:
; X64:       # BB#0:
; X64-NEXT:    subps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fsub <4 x float> %a0, %a1
  ret <4 x float> %res
}

; _mm_sub_ss equivalent: scalar subtract of lane 0 only; lanes 1-3 of %a0
; pass through unchanged (insertelement back into %a0).
define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ss:
; X32:       # BB#0:
; X32-NEXT:    subss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sub_ss:
; X64:       # BB#0:
; X64-NEXT:    subss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fsub = fsub float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fsub, i32 0
  ret <4 x float> %res
}

; _MM_TRANSPOSE4_PS equivalent: in-place 4x4 transpose. Rows are loaded, the
; transpose is built from the lo/hi unpack shuffle pairs (tmp0..tmp3) followed
; by 64-bit-half shuffles (res0..res3), and each result row is stored back.
define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
; X32-LABEL: test_MM_TRANSPOSE4_PS:
; X32:       # BB#0:
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps (%esi), %xmm0
; X32-NEXT:    movaps (%edx), %xmm1
; X32-NEXT:    movaps (%ecx), %xmm2
; X32-NEXT:    movaps (%eax), %xmm3
; X32-NEXT:    movaps %xmm0, %xmm4
; X32-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X32-NEXT:    movaps %xmm2, %xmm5
; X32-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X32-NEXT:    movaps %xmm4, %xmm1
; X32-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X32-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X32-NEXT:    movaps %xmm0, %xmm3
; X32-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X32-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X32-NEXT:    movaps %xmm1, (%esi)
; X32-NEXT:    movaps %xmm5, (%edx)
; X32-NEXT:    movaps %xmm3, (%ecx)
; X32-NEXT:    movaps %xmm2, (%eax)
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_TRANSPOSE4_PS:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    movaps (%rsi), %xmm1
; X64-NEXT:    movaps (%rdx), %xmm2
; X64-NEXT:    movaps (%rcx), %xmm3
; X64-NEXT:    movaps %xmm0, %xmm4
; X64-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X64-NEXT:    movaps %xmm2, %xmm5
; X64-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    movaps %xmm4, %xmm1
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X64-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X64-NEXT:    movaps %xmm0, %xmm3
; X64-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X64-NEXT:    movaps %xmm1, (%rdi)
; X64-NEXT:    movaps %xmm5, (%rsi)
; X64-NEXT:    movaps %xmm3, (%rdx)
; X64-NEXT:    movaps %xmm2, (%rcx)
; X64-NEXT:    retq
  %row0 = load <4 x float>, <4 x float>* %a0, align 16
  %row1 = load <4 x float>, <4 x float>* %a1, align 16
  %row2 = load <4 x float>, <4 x float>* %a2, align 16
  %row3 = load <4 x float>, <4 x float>* %a3, align 16
  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  store <4 x float> %res0, <4 x float>* %a0, align 16
  store <4 x float> %res1, <4 x float>* %a1, align 16
  store <4 x float> %res2, <4 x float>* %a2, align 16
  store <4 x float> %res3, <4 x float>* %a3, align 16
  ret void
}

; _mm_ucomieq_ss equivalent: unordered scalar compare-equal. The setnp+sete+andb
; sequence forces the result to 0 when the compare is unordered (PF set).
define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomieq_ss:
; X32:       # BB#0:
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setnp %al
; X32-NEXT:    sete %cl
; X32-NEXT:    andb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomieq_ss:
; X64:       # BB#0:
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setnp %al
; X64-NEXT:    sete %cl
; X64-NEXT:    andb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_ucomige_ss equivalent: greater-or-equal, materialized with setae.
define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomige_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomige_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_ucomigt_ss equivalent: greater-than, materialized with seta.
define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomigt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomigt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_ucomile_ss equivalent: less-or-equal; note the ucomiss operands are
; swapped relative to the ge test so setae can still be used.
define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomile_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm0, %xmm1
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomile_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm0, %xmm1
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_ucomilt_ss equivalent: less-than; operands swapped, result via seta.
define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomilt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm0, %xmm1
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomilt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm0, %xmm1
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_ucomineq_ss equivalent: not-equal. The setp+setne+orb sequence forces the
; result to 1 when the compare is unordered (PF set).
define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomineq_ss:
; X32:       # BB#0:
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setp %al
; X32-NEXT:    setne %cl
; X32-NEXT:    orb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomineq_ss:
; X64:       # BB#0:
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setp %al
; X64-NEXT:    setne %cl
; X64-NEXT:    orb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone

; _mm_undefined_ps equivalent: returns undef; no instructions expected before the return.
define <4 x float> @test_mm_undefined_ps() {
; X32-LABEL: test_mm_undefined_ps:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_undefined_ps:
; X64:       # BB#0:
; X64-NEXT:    retq
  ret <4 x float> undef
}

; _mm_unpackhi_ps equivalent: interleave the high halves of %a0 and %a1
; (lane order 2,6,3,7), selected as unpckhps.
define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpackhi_ps:
; X32:       # BB#0:
; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_unpackhi_ps:
; X64:       # BB#0:
; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %res
}

; _mm_unpacklo_ps equivalent: interleave the low halves of %a0 and %a1
; (lane order 0,4,1,5), selected as unpcklps.
define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpacklo_ps:
; X32:       # BB#0:
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_unpacklo_ps:
; X64:       # BB#0:
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %res
}

; _mm_xor_ps equivalent: bitwise XOR of two float vectors via <4 x i32>
; bitcasts. With SSE1 only (no SSE2/integer vector ops) the checks show the
; backend round-tripping through the stack and xoring 32-bit GPR pieces.
define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_xor_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_xor_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    xorl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    xorl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = xor <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

; Non-temporal store metadata, referenced by test_mm_stream_ps.
!0 = !{i32 1}