; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX

; It's the extra tests coverage for recip as discussed on D26855.

define float @f32_no_step_2(float %x) #3 {
; SSE-LABEL: f32_no_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: mulss {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_no_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_no_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_no_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_no_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_no_step_2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_no_step_2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 1234.0, %x
  ret float %div
}

define float @f32_one_step_2(float %x) #1 {
; SSE-LABEL: f32_one_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: addss %xmm2, %xmm1
; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_one_step_2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step_2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 3456.0, %x
  ret float %div
}

define float @f32_one_step_2_divs(float %x) #1 {
; SSE-LABEL: f32_one_step_2_divs:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: subss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss %xmm1, %xmm2
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2_divs:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; KNL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
; SKX-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 3456.0, %x
  %div2 = fdiv fast float %div, %x
  ret float %div2
}

define float @f32_two_step_2(float %x) #2 {
; SSE-LABEL: f32_two_step_2:
; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subss %xmm3, %xmm4
; SSE-NEXT: mulss %xmm2, %xmm4
; SSE-NEXT: addss %xmm2, %xmm4
; SSE-NEXT: mulss %xmm4, %xmm0
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm4, %xmm1
; SSE-NEXT: addss %xmm4, %xmm1
; SSE-NEXT: mulss {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_two_step_2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_two_step_2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_two_step_2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_two_step_2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: f32_two_step_2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: f32_two_step_2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
; SKX-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast float 6789.0, %x
  ret float %div
}

define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps %xmm2, %xmm1
; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_one_step2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  ret <4 x float> %div
}

define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step_2_divs:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: subps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: addps %xmm1, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step_2_divs:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  %div2 = fdiv fast <4 x float> %div, %x
  ret <4 x float> %div2
}

define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step2:
; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: subps %xmm3, %xmm4
; SSE-NEXT: mulps %xmm2, %xmm4
; SSE-NEXT: addps %xmm2, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm4, %xmm1
; SSE-NEXT: addps %xmm4, %xmm1
; SSE-NEXT: mulps {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_two_step2:
; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_two_step2:
; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_two_step2:
; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_two_step2:
; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_two_step2:
; SKX: # %bb.0:
; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
; SKX-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x 721 ret <4 x float> %div 722} 723 724define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 { 725; SSE-LABEL: v8f32_one_step2: 726; SSE: # %bb.0: 727; SSE-NEXT: rcpps %xmm1, %xmm4 728; SSE-NEXT: mulps %xmm4, %xmm1 729; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 730; SSE-NEXT: movaps %xmm2, %xmm3 731; SSE-NEXT: subps %xmm1, %xmm3 732; SSE-NEXT: mulps %xmm4, %xmm3 733; SSE-NEXT: addps %xmm4, %xmm3 734; SSE-NEXT: rcpps %xmm0, %xmm1 735; SSE-NEXT: mulps %xmm1, %xmm0 736; SSE-NEXT: subps %xmm0, %xmm2 737; SSE-NEXT: mulps %xmm1, %xmm2 738; SSE-NEXT: addps %xmm1, %xmm2 739; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 740; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 741; SSE-NEXT: movaps %xmm2, %xmm0 742; SSE-NEXT: movaps %xmm3, %xmm1 743; SSE-NEXT: retq 744; 745; AVX-RECIP-LABEL: v8f32_one_step2: 746; AVX-RECIP: # %bb.0: 747; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 748; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 749; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 750; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 751; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 752; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 753; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 754; AVX-RECIP-NEXT: retq 755; 756; FMA-RECIP-LABEL: v8f32_one_step2: 757; FMA-RECIP: # %bb.0: 758; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 759; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem 760; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 761; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 762; FMA-RECIP-NEXT: retq 763; 764; BTVER2-LABEL: v8f32_one_step2: 765; BTVER2: # %bb.0: 766; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 767; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # 
sched: [2:2.00] 768; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 769; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 770; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 771; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 772; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 773; BTVER2-NEXT: retq # sched: [4:1.00] 774; 775; SANDY-LABEL: v8f32_one_step2: 776; SANDY: # %bb.0: 777; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 778; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 779; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 780; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 781; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 782; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 783; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 784; SANDY-NEXT: retq # sched: [1:1.00] 785; 786; HASWELL-LABEL: v8f32_one_step2: 787; HASWELL: # %bb.0: 788; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 789; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 790; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 791; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 792; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 793; HASWELL-NEXT: retq # sched: [7:1.00] 794; 795; HASWELL-NO-FMA-LABEL: v8f32_one_step2: 796; HASWELL-NO-FMA: # %bb.0: 797; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 798; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] 799; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 800; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 801; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 802; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, 
%ymm0 # sched: [3:1.00] 803; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 804; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 805; 806; KNL-LABEL: v8f32_one_step2: 807; KNL: # %bb.0: 808; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 809; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 810; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 811; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 812; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 813; KNL-NEXT: retq # sched: [7:1.00] 814; 815; SKX-LABEL: v8f32_one_step2: 816; SKX: # %bb.0: 817; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 818; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50] 819; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50] 820; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] 821; SKX-NEXT: retq # sched: [7:1.00] 822 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 823 ret <8 x float> %div 824} 825 826define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { 827; SSE-LABEL: v8f32_one_step_2_divs: 828; SSE: # %bb.0: 829; SSE-NEXT: rcpps %xmm0, %xmm2 830; SSE-NEXT: mulps %xmm2, %xmm0 831; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 832; SSE-NEXT: movaps %xmm3, %xmm4 833; SSE-NEXT: subps %xmm0, %xmm4 834; SSE-NEXT: mulps %xmm2, %xmm4 835; SSE-NEXT: addps %xmm2, %xmm4 836; SSE-NEXT: rcpps %xmm1, %xmm0 837; SSE-NEXT: mulps %xmm0, %xmm1 838; SSE-NEXT: subps %xmm1, %xmm3 839; SSE-NEXT: mulps %xmm0, %xmm3 840; SSE-NEXT: addps %xmm0, %xmm3 841; SSE-NEXT: movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00] 842; SSE-NEXT: mulps %xmm3, %xmm1 843; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] 844; SSE-NEXT: mulps %xmm4, %xmm0 845; 
SSE-NEXT: mulps %xmm4, %xmm0 846; SSE-NEXT: mulps %xmm3, %xmm1 847; SSE-NEXT: retq 848; 849; AVX-RECIP-LABEL: v8f32_one_step_2_divs: 850; AVX-RECIP: # %bb.0: 851; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 852; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 853; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 854; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 855; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 856; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 857; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 858; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 859; AVX-RECIP-NEXT: retq 860; 861; FMA-RECIP-LABEL: v8f32_one_step_2_divs: 862; FMA-RECIP: # %bb.0: 863; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 864; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem 865; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 866; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 867; FMA-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 868; FMA-RECIP-NEXT: retq 869; 870; BTVER2-LABEL: v8f32_one_step_2_divs: 871; BTVER2: # %bb.0: 872; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 873; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] 874; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 875; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00] 876; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 877; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 878; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00] 879; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 880; BTVER2-NEXT: retq # sched: [4:1.00] 881; 882; SANDY-LABEL: v8f32_one_step_2_divs: 883; SANDY: # %bb.0: 884; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 885; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 886; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 887; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 888; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 889; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 890; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00] 891; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 892; SANDY-NEXT: retq # sched: [1:1.00] 893; 894; HASWELL-LABEL: v8f32_one_step_2_divs: 895; HASWELL: # %bb.0: 896; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 897; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 898; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50] 899; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 900; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50] 901; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 902; HASWELL-NEXT: retq # sched: [7:1.00] 903; 904; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: 905; HASWELL-NO-FMA: # %bb.0: 906; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 907; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] 908; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 909; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 910; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 911; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 912; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50] 913; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 914; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 915; 916; KNL-LABEL: v8f32_one_step_2_divs: 917; KNL: # %bb.0: 918; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 919; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 920; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = 
-(ymm1 * ymm0) + ymm2 sched: [5:0.50] 921; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50] 922; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50] 923; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 924; KNL-NEXT: retq # sched: [7:1.00] 925; 926; SKX-LABEL: v8f32_one_step_2_divs: 927; SKX: # %bb.0: 928; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 929; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50] 930; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50] 931; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50] 932; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50] 933; SKX-NEXT: retq # sched: [7:1.00] 934 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 935 %div2 = fdiv fast <8 x float> %div, %x 936 ret <8 x float> %div2 937} 938 939define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { 940; SSE-LABEL: v8f32_two_step2: 941; SSE: # %bb.0: 942; SSE-NEXT: movaps %xmm0, %xmm2 943; SSE-NEXT: rcpps %xmm1, %xmm3 944; SSE-NEXT: movaps %xmm1, %xmm4 945; SSE-NEXT: mulps %xmm3, %xmm4 946; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 947; SSE-NEXT: movaps %xmm0, %xmm5 948; SSE-NEXT: subps %xmm4, %xmm5 949; SSE-NEXT: mulps %xmm3, %xmm5 950; SSE-NEXT: addps %xmm3, %xmm5 951; SSE-NEXT: mulps %xmm5, %xmm1 952; SSE-NEXT: movaps %xmm0, %xmm3 953; SSE-NEXT: subps %xmm1, %xmm3 954; SSE-NEXT: mulps %xmm5, %xmm3 955; SSE-NEXT: addps %xmm5, %xmm3 956; SSE-NEXT: rcpps %xmm2, %xmm1 957; SSE-NEXT: movaps %xmm2, %xmm4 958; SSE-NEXT: mulps %xmm1, %xmm4 959; SSE-NEXT: movaps %xmm0, %xmm5 960; SSE-NEXT: subps %xmm4, %xmm5 961; SSE-NEXT: mulps %xmm1, %xmm5 962; SSE-NEXT: addps %xmm1, %xmm5 963; SSE-NEXT: mulps %xmm5, %xmm2 964; SSE-NEXT: subps %xmm2, %xmm0 965; SSE-NEXT: mulps %xmm5, %xmm0 966; SSE-NEXT: addps %xmm5, %xmm0 967; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 
968; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 969; SSE-NEXT: movaps %xmm3, %xmm1 970; SSE-NEXT: retq 971; 972; AVX-RECIP-LABEL: v8f32_two_step2: 973; AVX-RECIP: # %bb.0: 974; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 975; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 976; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 977; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 978; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 979; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 980; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 981; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 982; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 983; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 984; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 985; AVX-RECIP-NEXT: retq 986; 987; FMA-RECIP-LABEL: v8f32_two_step2: 988; FMA-RECIP: # %bb.0: 989; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 990; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 991; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 992; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 993; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 994; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 995; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 996; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 997; FMA-RECIP-NEXT: retq 998; 999; BTVER2-LABEL: v8f32_two_step2: 1000; BTVER2: # %bb.0: 1001; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 1002; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00] 1003; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00] 1004; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00] 1005; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00] 1006; BTVER2-NEXT: vaddps 
%ymm2, %ymm1, %ymm1 # sched: [3:2.00] 1007; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00] 1008; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] 1009; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00] 1010; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00] 1011; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1012; BTVER2-NEXT: retq # sched: [4:1.00] 1013; 1014; SANDY-LABEL: v8f32_two_step2: 1015; SANDY: # %bb.0: 1016; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00] 1017; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00] 1018; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 1019; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] 1020; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00] 1021; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] 1022; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00] 1023; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1024; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00] 1025; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 1026; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1027; SANDY-NEXT: retq # sched: [1:1.00] 1028; 1029; HASWELL-LABEL: v8f32_two_step2: 1030; HASWELL: # %bb.0: 1031; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 1032; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1033; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] 1034; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50] 1035; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50] 1036; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50] 1037; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50] 1038; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1039; 
HASWELL-NEXT: retq # sched: [7:1.00] 1040; 1041; HASWELL-NO-FMA-LABEL: v8f32_two_step2: 1042; HASWELL-NO-FMA: # %bb.0: 1043; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 1044; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50] 1045; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1046; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00] 1047; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50] 1048; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00] 1049; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50] 1050; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1051; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50] 1052; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] 1053; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1054; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1055; 1056; KNL-LABEL: v8f32_two_step2: 1057; KNL: # %bb.0: 1058; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00] 1059; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1060; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00] 1061; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50] 1062; KNL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50] 1063; KNL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50] 1064; KNL-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50] 1065; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1066; KNL-NEXT: retq # sched: [7:1.00] 1067; 1068; SKX-LABEL: v8f32_two_step2: 1069; SKX: # %bb.0: 1070; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00] 1071; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1072; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33] 1073; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50] 
1074; SKX-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50] 1075; SKX-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50] 1076; SKX-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50] 1077; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] 1078; SKX-NEXT: retq # sched: [7:1.00] 1079 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 1080 ret <8 x float> %div 1081} 1082 1083define <8 x float> @v8f32_no_step(<8 x float> %x) #3 { 1084; SSE-LABEL: v8f32_no_step: 1085; SSE: # %bb.0: 1086; SSE-NEXT: rcpps %xmm0, %xmm0 1087; SSE-NEXT: rcpps %xmm1, %xmm1 1088; SSE-NEXT: retq 1089; 1090; AVX-RECIP-LABEL: v8f32_no_step: 1091; AVX-RECIP: # %bb.0: 1092; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 1093; AVX-RECIP-NEXT: retq 1094; 1095; FMA-RECIP-LABEL: v8f32_no_step: 1096; FMA-RECIP: # %bb.0: 1097; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 1098; FMA-RECIP-NEXT: retq 1099; 1100; BTVER2-LABEL: v8f32_no_step: 1101; BTVER2: # %bb.0: 1102; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] 1103; BTVER2-NEXT: retq # sched: [4:1.00] 1104; 1105; SANDY-LABEL: v8f32_no_step: 1106; SANDY: # %bb.0: 1107; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] 1108; SANDY-NEXT: retq # sched: [1:1.00] 1109; 1110; HASWELL-LABEL: v8f32_no_step: 1111; HASWELL: # %bb.0: 1112; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1113; HASWELL-NEXT: retq # sched: [7:1.00] 1114; 1115; HASWELL-NO-FMA-LABEL: v8f32_no_step: 1116; HASWELL-NO-FMA: # %bb.0: 1117; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1118; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1119; 1120; KNL-LABEL: v8f32_no_step: 1121; KNL: # %bb.0: 1122; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1123; KNL-NEXT: retq # sched: [7:1.00] 1124; 1125; SKX-LABEL: v8f32_no_step: 1126; SKX: # %bb.0: 1127; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] 1128; SKX-NEXT: retq # sched: [7:1.00] 1129 %div = 
fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x 1130 ret <8 x float> %div 1131} 1132 1133define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 { 1134; SSE-LABEL: v8f32_no_step2: 1135; SSE: # %bb.0: 1136; SSE-NEXT: rcpps %xmm1, %xmm1 1137; SSE-NEXT: rcpps %xmm0, %xmm0 1138; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 1139; SSE-NEXT: mulps {{.*}}(%rip), %xmm1 1140; SSE-NEXT: retq 1141; 1142; AVX-RECIP-LABEL: v8f32_no_step2: 1143; AVX-RECIP: # %bb.0: 1144; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0 1145; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1146; AVX-RECIP-NEXT: retq 1147; 1148; FMA-RECIP-LABEL: v8f32_no_step2: 1149; FMA-RECIP: # %bb.0: 1150; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0 1151; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1152; FMA-RECIP-NEXT: retq 1153; 1154; BTVER2-LABEL: v8f32_no_step2: 1155; BTVER2: # %bb.0: 1156; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00] 1157; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1158; BTVER2-NEXT: retq # sched: [4:1.00] 1159; 1160; SANDY-LABEL: v8f32_no_step2: 1161; SANDY: # %bb.0: 1162; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00] 1163; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1164; SANDY-NEXT: retq # sched: [1:1.00] 1165; 1166; HASWELL-LABEL: v8f32_no_step2: 1167; HASWELL: # %bb.0: 1168; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1169; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1170; HASWELL-NEXT: retq # sched: [7:1.00] 1171; 1172; HASWELL-NO-FMA-LABEL: v8f32_no_step2: 1173; HASWELL-NO-FMA: # %bb.0: 1174; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1175; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1176; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1177; 1178; KNL-LABEL: v8f32_no_step2: 1179; KNL: # %bb.0: 1180; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00] 1181; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 
1182; KNL-NEXT: retq # sched: [7:1.00] 1183; 1184; SKX-LABEL: v8f32_no_step2: 1185; SKX: # %bb.0: 1186; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00] 1187; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50] 1188; SKX-NEXT: retq # sched: [7:1.00] 1189 %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x 1190 ret <8 x float> %div 1191} 1192 1193define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 { 1194; SSE-LABEL: v16f32_one_step2: 1195; SSE: # %bb.0: 1196; SSE-NEXT: movaps %xmm3, %xmm4 1197; SSE-NEXT: movaps %xmm2, %xmm5 1198; SSE-NEXT: movaps %xmm0, %xmm6 1199; SSE-NEXT: rcpps %xmm3, %xmm2 1200; SSE-NEXT: mulps %xmm2, %xmm4 1201; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1202; SSE-NEXT: movaps %xmm0, %xmm3 1203; SSE-NEXT: subps %xmm4, %xmm3 1204; SSE-NEXT: mulps %xmm2, %xmm3 1205; SSE-NEXT: addps %xmm2, %xmm3 1206; SSE-NEXT: rcpps %xmm5, %xmm4 1207; SSE-NEXT: mulps %xmm4, %xmm5 1208; SSE-NEXT: movaps %xmm0, %xmm2 1209; SSE-NEXT: subps %xmm5, %xmm2 1210; SSE-NEXT: mulps %xmm4, %xmm2 1211; SSE-NEXT: addps %xmm4, %xmm2 1212; SSE-NEXT: rcpps %xmm1, %xmm5 1213; SSE-NEXT: mulps %xmm5, %xmm1 1214; SSE-NEXT: movaps %xmm0, %xmm4 1215; SSE-NEXT: subps %xmm1, %xmm4 1216; SSE-NEXT: mulps %xmm5, %xmm4 1217; SSE-NEXT: addps %xmm5, %xmm4 1218; SSE-NEXT: rcpps %xmm6, %xmm1 1219; SSE-NEXT: mulps %xmm1, %xmm6 1220; SSE-NEXT: subps %xmm6, %xmm0 1221; SSE-NEXT: mulps %xmm1, %xmm0 1222; SSE-NEXT: addps %xmm1, %xmm0 1223; SSE-NEXT: mulps {{.*}}(%rip), %xmm0 1224; SSE-NEXT: mulps {{.*}}(%rip), %xmm4 1225; SSE-NEXT: mulps {{.*}}(%rip), %xmm2 1226; SSE-NEXT: mulps {{.*}}(%rip), %xmm3 1227; SSE-NEXT: movaps %xmm4, %xmm1 1228; SSE-NEXT: retq 1229; 1230; AVX-RECIP-LABEL: v16f32_one_step2: 1231; AVX-RECIP: # %bb.0: 1232; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2 1233; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1 1234; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = 
[1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1235; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1 1236; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1 1237; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1 1238; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 1239; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 1240; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 1241; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 1242; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 1243; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1244; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1245; AVX-RECIP-NEXT: retq 1246; 1247; FMA-RECIP-LABEL: v16f32_one_step2: 1248; FMA-RECIP: # %bb.0: 1249; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 1250; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] 1251; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 1252; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 1253; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 1254; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 1255; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 1256; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 1257; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 1258; FMA-RECIP-NEXT: retq 1259; 1260; BTVER2-LABEL: v16f32_one_step2: 1261; BTVER2: # %bb.0: 1262; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00] 1263; BTVER2-NEXT: vrcpps %ymm1, %ymm2 # sched: [2:2.00] 1264; BTVER2-NEXT: vrcpps %ymm0, %ymm4 # sched: [2:2.00] 1265; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00] 1266; BTVER2-NEXT: vmulps %ymm4, %ymm0, %ymm0 # sched: [2:2.00] 1267; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00] 1268; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00] 1269; BTVER2-NEXT: vmulps 
%ymm1, %ymm2, %ymm1 # sched: [2:2.00] 1270; BTVER2-NEXT: vmulps %ymm0, %ymm4, %ymm0 # sched: [2:2.00] 1271; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00] 1272; BTVER2-NEXT: vaddps %ymm0, %ymm4, %ymm0 # sched: [3:2.00] 1273; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00] 1274; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00] 1275; BTVER2-NEXT: retq # sched: [4:1.00] 1276; 1277; SANDY-LABEL: v16f32_one_step2: 1278; SANDY: # %bb.0: 1279; SANDY-NEXT: vrcpps %ymm1, %ymm2 # sched: [7:2.00] 1280; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00] 1281; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50] 1282; SANDY-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00] 1283; SANDY-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00] 1284; SANDY-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 1285; SANDY-NEXT: vrcpps %ymm0, %ymm2 # sched: [7:2.00] 1286; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00] 1287; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1288; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00] 1289; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1290; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00] 1291; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00] 1292; SANDY-NEXT: retq # sched: [1:1.00] 1293; 1294; HASWELL-LABEL: v16f32_one_step2: 1295; HASWELL: # %bb.0: 1296; HASWELL-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1297; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1298; HASWELL-NEXT: vrcpps %ymm0, %ymm4 # sched: [11:2.00] 1299; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50] 1300; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50] 1301; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50] 1302; HASWELL-NEXT: 
vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50] 1303; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1304; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1305; HASWELL-NEXT: retq # sched: [7:1.00] 1306; 1307; HASWELL-NO-FMA-LABEL: v16f32_one_step2: 1308; HASWELL-NO-FMA: # %bb.0: 1309; HASWELL-NO-FMA-NEXT: vrcpps %ymm1, %ymm2 # sched: [11:2.00] 1310; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50] 1311; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50] 1312; HASWELL-NO-FMA-NEXT: vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00] 1313; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50] 1314; HASWELL-NO-FMA-NEXT: vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00] 1315; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm2 # sched: [11:2.00] 1316; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50] 1317; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00] 1318; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50] 1319; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00] 1320; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50] 1321; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50] 1322; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00] 1323; 1324; KNL-LABEL: v16f32_one_step2: 1325; KNL: # %bb.0: 1326; KNL-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [11:2.00] 1327; KNL-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50] 1328; KNL-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50] 1329; KNL-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50] 1330; KNL-NEXT: retq # sched: [7:1.00] 1331; 1332; SKX-LABEL: v16f32_one_step2: 1333; SKX: # %bb.0: 1334; SKX-NEXT: vrcp14ps %zmm0, %zmm1 # sched: [4:2.00] 1335; SKX-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50] 1336; SKX-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 
sched: [4:0.50]
; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_one_step_2_divs:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    subps %xmm0, %xmm5
; SSE-NEXT:    mulps %xmm6, %xmm5
; SSE-NEXT:    addps %xmm6, %xmm5
; SSE-NEXT:    rcpps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    subps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm0, %xmm6
; SSE-NEXT:    addps %xmm0, %xmm6
; SSE-NEXT:    rcpps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    subps %xmm2, %xmm7
; SSE-NEXT:    mulps %xmm0, %xmm7
; SSE-NEXT:    addps %xmm0, %xmm7
; SSE-NEXT:    rcpps %xmm3, %xmm0
; SSE-NEXT:    mulps %xmm0, %xmm3
; SSE-NEXT:    subps %xmm3, %xmm4
; SSE-NEXT:    mulps %xmm0, %xmm4
; SSE-NEXT:    addps %xmm0, %xmm4
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01]
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01]
; SSE-NEXT:    mulps %xmm7, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm7, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_one_step_2_divs:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
; FMA-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_one_step_2_divs:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [7:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [7:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_one_step_2_divs:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_one_step_2_divs:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
; HASWELL-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v16f32_one_step_2_divs:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
; KNL-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_one_step_2_divs:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
; SKX-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  %div2 = fdiv fast <16 x float> %div, %x
  ret <16 x float> %div2
}

define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; SSE-LABEL: v16f32_two_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm3, %xmm6
; SSE-NEXT:    movaps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    rcpps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm0, %xmm7
; SSE-NEXT:    subps %xmm3, %xmm7
; SSE-NEXT:    mulps %xmm2, %xmm7
; SSE-NEXT:    addps %xmm2, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm6
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    subps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm7, %xmm3
; SSE-NEXT:    addps %xmm7, %xmm3
; SSE-NEXT:    rcpps %xmm5, %xmm2
; SSE-NEXT:    movaps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm2, %xmm6
; SSE-NEXT:    movaps %xmm0, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm2, %xmm7
; SSE-NEXT:    addps %xmm2, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm5
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm7, %xmm2
; SSE-NEXT:    addps %xmm7, %xmm2
; SSE-NEXT:    rcpps %xmm1, %xmm5
; SSE-NEXT:    movaps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm5, %xmm6
; SSE-NEXT:    movaps %xmm0, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm5, %xmm7
; SSE-NEXT:    addps %xmm5, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm5
; SSE-NEXT:    subps %xmm1, %xmm5
; SSE-NEXT:    mulps %xmm7, %xmm5
; SSE-NEXT:    addps %xmm7, %xmm5
; SSE-NEXT:    rcpps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    mulps %xmm1, %xmm6
; SSE-NEXT:    movaps %xmm0, %xmm7
; SSE-NEXT:    subps %xmm6, %xmm7
; SSE-NEXT:    mulps %xmm1, %xmm7
; SSE-NEXT:    addps %xmm1, %xmm7
; SSE-NEXT:    mulps %xmm7, %xmm4
; SSE-NEXT:    subps %xmm4, %xmm0
; SSE-NEXT:    mulps %xmm7, %xmm0
; SSE-NEXT:    addps %xmm7, %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm5
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
; SSE-NEXT:    movaps %xmm5, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_two_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm4, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_two_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_two_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_two_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v16f32_two_step2:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_two_step2:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
; SSE-LABEL: v16f32_no_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    rcpps %xmm1, %xmm1
; SSE-NEXT:    rcpps %xmm2, %xmm2
; SSE-NEXT:    rcpps %xmm3, %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_no_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_no_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_no_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_no_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_no_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v16f32_no_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_no_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}

define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
; SSE-LABEL: v16f32_no_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm3, %xmm3
; SSE-NEXT:    rcpps %xmm2, %xmm2
; SSE-NEXT:    rcpps %xmm1, %xmm1
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_no_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_no_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v16f32_no_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v16f32_no_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v16f32_no_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
;
; KNL-LABEL: v16f32_no_step2:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v16f32_no_step2:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}

attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }