; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AVX

declare float @fmaxf(float, float)
declare double @fmax(double, double)
declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)
declare float @llvm.maxnum.f32(float, float)
declare double @llvm.maxnum.f64(double, double)
declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80)

declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)

; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.

define float @test_fmaxf(float %x, float %y) {
; SSE-LABEL: test_fmaxf:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_fmaxf:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call float @fmaxf(float %x, float %y) readnone
  ret float %z
}

define float @test_fmaxf_minsize(float %x, float %y) minsize {
; CHECK-LABEL: test_fmaxf_minsize:
; CHECK:       # %bb.0:
; CHECK-NEXT:    jmp fmaxf # TAILCALL
  %z = call float @fmaxf(float %x, float %y) readnone
  ret float %z
}

; FIXME: As the vector tests show, the SSE run shouldn't need this many moves.
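; maxnum returns the non-NaN operand when exactly one input is NaN, while the x86
; max instructions return their second source operand whenever an input is NaN.
; The cmpunord/blend sequences in these tests exist to select %y when %x is NaN.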

define double @test_fmax(double %x, double %y) {
; SSE-LABEL: test_fmax:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm3
; SSE-NEXT:    andpd %xmm1, %xmm3
; SSE-NEXT:    maxsd %xmm0, %xmm1
; SSE-NEXT:    andnpd %xmm1, %xmm2
; SSE-NEXT:    orpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_fmax:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call double @fmax(double %x, double %y) readnone
  ret double %z
}

define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
; CHECK-LABEL: test_fmaxl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt (%rsp)
; CHECK-NEXT:    callq fmaxl
; CHECK-NEXT:    addq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone
  ret x86_fp80 %z
}

define float @test_intrinsic_fmaxf(float %x, float %y) {
; SSE-LABEL: test_intrinsic_fmaxf:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    cmpunordss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    maxss %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmaxf:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxss %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
  ret float %z
}

define double @test_intrinsic_fmax(double %x, double %y) {
; SSE-LABEL: test_intrinsic_fmax:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    cmpunordsd %xmm0, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm3
; SSE-NEXT:    andpd %xmm1, %xmm3
; SSE-NEXT:    maxsd %xmm0, %xmm1
; SSE-NEXT:    andnpd %xmm1, %xmm2
; SSE-NEXT:    orpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
  ret double %z
}

define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) {
; CHECK-LABEL: test_intrinsic_fmaxl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt (%rsp)
; CHECK-NEXT:    callq fmaxl
; CHECK-NEXT:    addq $40, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
  ret x86_fp80 %z
}

define <2 x float> @test_intrinsic_fmax_v2f32(<2 x float> %x, <2 x float> %y) {
; SSE-LABEL: test_intrinsic_fmax_v2f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    maxps %xmm0, %xmm2
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
  ret <2 x float> %z
}

define <4 x float> @test_intrinsic_fmax_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: test_intrinsic_fmax_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    maxps %xmm0, %xmm2
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
  ret <4 x float> %z
}

define <2 x double> @test_intrinsic_fmax_v2f64(<2 x double> %x, <2 x double> %y) {
; SSE-LABEL: test_intrinsic_fmax_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    maxpd %xmm0, %xmm2
; SSE-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE-NEXT:    andpd %xmm0, %xmm1
; SSE-NEXT:    andnpd %xmm2, %xmm0
; SSE-NEXT:    orpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT:    retq
  %z = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> %y) readnone
  ret <2 x double> %z
}

define <4 x double> @test_intrinsic_fmax_v4f64(<4 x double> %x, <4 x double> %y) {
; SSE-LABEL: test_intrinsic_fmax_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm2, %xmm4
; SSE-NEXT:    maxpd %xmm0, %xmm4
; SSE-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE-NEXT:    andpd %xmm0, %xmm2
; SSE-NEXT:    andnpd %xmm4, %xmm0
; SSE-NEXT:    orpd %xmm2, %xmm0
; SSE-NEXT:    movapd %xmm3, %xmm2
; SSE-NEXT:    maxpd %xmm1, %xmm2
; SSE-NEXT:    cmpunordpd %xmm1, %xmm1
; SSE-NEXT:    andpd %xmm1, %xmm3
; SSE-NEXT:    andnpd %xmm2, %xmm1
; SSE-NEXT:    orpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm0, %ymm1, %ymm2
; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX-NEXT:    retq
  %z = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
  ret <4 x double> %z
}

define <8 x double> @test_intrinsic_fmax_v8f64(<8 x double> %x, <8 x double> %y) {
; SSE-LABEL: test_intrinsic_fmax_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm4, %xmm8
; SSE-NEXT:    maxpd %xmm0, %xmm8
; SSE-NEXT:    cmpunordpd %xmm0, %xmm0
; SSE-NEXT:    andpd %xmm0, %xmm4
; SSE-NEXT:    andnpd %xmm8, %xmm0
; SSE-NEXT:    orpd %xmm4, %xmm0
; SSE-NEXT:    movapd %xmm5, %xmm4
; SSE-NEXT:    maxpd %xmm1, %xmm4
; SSE-NEXT:    cmpunordpd %xmm1, %xmm1
; SSE-NEXT:    andpd %xmm1, %xmm5
; SSE-NEXT:    andnpd %xmm4, %xmm1
; SSE-NEXT:    orpd %xmm5, %xmm1
; SSE-NEXT:    movapd %xmm6, %xmm4
; SSE-NEXT:    maxpd %xmm2, %xmm4
; SSE-NEXT:    cmpunordpd %xmm2, %xmm2
; SSE-NEXT:    andpd %xmm2, %xmm6
; SSE-NEXT:    andnpd %xmm4, %xmm2
; SSE-NEXT:    orpd %xmm6, %xmm2
; SSE-NEXT:    movapd %xmm7, %xmm4
; SSE-NEXT:    maxpd %xmm3, %xmm4
; SSE-NEXT:    cmpunordpd %xmm3, %xmm3
; SSE-NEXT:    andpd %xmm3, %xmm7
; SSE-NEXT:    andnpd %xmm4, %xmm3
; SSE-NEXT:    orpd %xmm7, %xmm3
; SSE-NEXT:    retq
;
; AVX-LABEL: test_intrinsic_fmax_v8f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %ymm0, %ymm2, %ymm4
; AVX-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    vblendvpd %ymm0, %ymm2, %ymm4, %ymm0
; AVX-NEXT:    vmaxpd %ymm1, %ymm3, %ymm2
; AVX-NEXT:    vcmpunordpd %ymm1, %ymm1, %ymm1
; AVX-NEXT:    vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
; AVX-NEXT:    retq
  %z = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %x, <8 x double> %y) readnone
  ret <8 x double> %z
}

; The IR-level FMF propagate to the node. With nnan, there's no need to blend.

define double @maxnum_intrinsic_nnan_fmf_f64(double %a, double %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call nnan double @llvm.maxnum.f64(double %a, double %b)
  ret double %r
}

; Make sure vectors work too.

define <4 x float> @maxnum_intrinsic_nnan_fmf_f432(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: maxnum_intrinsic_nnan_fmf_f432:
; SSE:       # %bb.0:
; SSE-NEXT:    maxps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_fmf_f432:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
  ret <4 x float> %r
}

; Current (but legacy someday): a function-level attribute should also enable the fold.

define float @maxnum_intrinsic_nnan_attr_f32(float %a, float %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    maxss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call float @llvm.maxnum.f32(float %a, float %b)
  ret float %r
}

; Make sure vectors work too.

define <2 x double> @maxnum_intrinsic_nnan_attr_f64(<2 x double> %a, <2 x double> %b) #0 {
; SSE-LABEL: maxnum_intrinsic_nnan_attr_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    maxpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: maxnum_intrinsic_nnan_attr_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %r = tail call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
  ret <2 x double> %r
}

attributes #0 = { "no-nans-fp-math"="true" }