; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
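
; Checks lowering of the llvm.roundeven.* intrinsics on x86-64. With only SSE2
; the scalar cases lower to libcalls to roundevenf/roundeven; with SSE4.1/AVX
; they use roundss/roundsd/roundps/roundpd (the vround* forms for AVX) with
; immediate 8, and with AVX512F the 512-bit cases use vrndscaleps/vrndscalepd.
; The immediate 8 should encode round-to-nearest-even (imm[1:0] = 0) with the
; precision (inexact) exception suppressed (imm[3] = 1).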

define float @roundeven_f32(float %x) {
; SSE2-LABEL: roundeven_f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundevenf ## TAILCALL
;
; SSE41-LABEL: roundeven_f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundss $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call float @llvm.roundeven.f32(float %x)
  ret float %a
}

define double @roundeven_f64(double %x) {
; SSE2-LABEL: roundeven_f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    jmp _roundeven ## TAILCALL
;
; SSE41-LABEL: roundeven_f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundsd $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundsd $8, %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call double @llvm.roundeven.f64(double %x)
  ret double %a
}
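
; For the vector cases below, targets without SSE4.1 scalarize the operation:
; each element is extracted, rounded through a roundevenf/roundeven libcall,
; and the results are repacked with unpcklps/unpcklpd (movhlps/movlhps for the
; f64 lanes).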
define <4 x float> @roundeven_v4f32(<4 x float> %x) {
; SSE2-LABEL: roundeven_v4f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v4f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
  ret <4 x float> %a
}

define <2 x double> @roundeven_v2f64(<2 x double> %x) {
; SSE2-LABEL: roundeven_v2f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $40, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 48
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    addq $40, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v2f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v2f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
  ret <2 x double> %a
}

define <8 x float> @roundeven_v8f32(<8 x float> %x) {
; SSE2-LABEL: roundeven_v8f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $72, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 80
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $72, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v8f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    roundps $8, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v8f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundps $8, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %x)
  ret <8 x float> %a
}

define <4 x double> @roundeven_v4f64(<4 x double> %x) {
; SSE2-LABEL: roundeven_v4f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 64
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v4f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $8, %xmm1, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: roundeven_v4f64:
; AVX:       ## %bb.0:
; AVX-NEXT:    vroundpd $8, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x)
  ret <4 x double> %a
}
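
; For the 512-bit types, AVX1 splits the operation into two 256-bit vroundps/
; vroundpd instructions, while AVX512F uses a single vrndscaleps/vrndscalepd
; on the full zmm register (with the same immediate 8).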
define <16 x float> @roundeven_v16f32(<16 x float> %x) {
; SSE2-LABEL: roundeven_v16f32:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $104, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 112
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm1 = xmm1[0],mem[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq _roundevenf
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
; SSE2-NEXT:    ## xmm3 = xmm3[0],mem[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $104, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v16f32:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundps $8, %xmm0, %xmm0
; SSE41-NEXT:    roundps $8, %xmm1, %xmm1
; SSE41-NEXT:    roundps $8, %xmm2, %xmm2
; SSE41-NEXT:    roundps $8, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_v16f32:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vroundps $8, %ymm0, %ymm0
; AVX1-NEXT:    vroundps $8, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: roundeven_v16f32:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscaleps $8, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <16 x float> @llvm.roundeven.v16f32(<16 x float> %x)
  ret <16 x float> %a
}
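
; The <8 x double> case mirrors v16f32: eight roundeven libcalls when only SSE2
; is available, four 128-bit roundpd for SSE4.1, two 256-bit vroundpd for AVX1,
; and a single 512-bit vrndscalepd for AVX512F.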
define <8 x double> @roundeven_v8f64(<8 x double> %x) {
; SSE2-LABEL: roundeven_v8f64:
; SSE2:       ## %bb.0:
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 96
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq _roundeven
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
; SSE2-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    retq
;
; SSE41-LABEL: roundeven_v8f64:
; SSE41:       ## %bb.0:
; SSE41-NEXT:    roundpd $8, %xmm0, %xmm0
; SSE41-NEXT:    roundpd $8, %xmm1, %xmm1
; SSE41-NEXT:    roundpd $8, %xmm2, %xmm2
; SSE41-NEXT:    roundpd $8, %xmm3, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: roundeven_v8f64:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vroundpd $8, %ymm0, %ymm0
; AVX1-NEXT:    vroundpd $8, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: roundeven_v8f64:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vrndscalepd $8, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x)
  ret <8 x double> %a
}

declare float @llvm.roundeven.f32(float)
declare double @llvm.roundeven.f64(double)
declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
declare <2 x double> @llvm.roundeven.v2f64(<2 x double>)
declare <8 x float> @llvm.roundeven.v8f32(<8 x float>)
declare <4 x double> @llvm.roundeven.v4f64(<4 x double>)
declare <16 x float> @llvm.roundeven.v16f32(<16 x float>)
declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)