; LLVM/FileCheck regression test for x86 reciprocal (rcpss/rcpps + Newton-Raphson) codegen.
; (Web-viewer navigation chrome that was pasted above the test has been removed.)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX

; If the target's divss/divps instructions are substantially
; slower than rcpss/rcpps with a Newton-Raphson refinement,
; we should generate the estimate sequence.

; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
; for details about the accuracy, speed, and implementation
; differences of x86 reciprocal estimates.

; 1.0/x, scalar. NOTE(review): attrs #0 (declared later in the file, outside this
; view) presumably forbids the reciprocal estimate -- all prefixes expect a real divss.
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: f32_no_estimate:
; KNL:       # %bb.0:
; KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: f32_no_estimate:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast float 1.0, %x
  ret float %div
}

; 1.0/x, scalar. NOTE(review): attrs #1 presumably enables the rcpss estimate with
; one Newton-Raphson refinement step (checks show rcpss + one mul/sub/mul/add round
; or the equivalent FMA pair) -- confirm against the attribute list.
define float @f32_one_step(float %x) #1 {
; SSE-LABEL: f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast float 1.0, %x
  ret float %div
}

; 1.0/x, scalar. NOTE(review): attrs #2 presumably requests two Newton-Raphson
; refinement steps (checks show rcpss followed by two refinement rounds) -- confirm
; against the attribute list.
define float @f32_two_step(float %x) #2 {
; SSE-LABEL: f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulss %xmm2, %xmm3
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    subss %xmm3, %xmm4
; SSE-NEXT:    mulss %xmm2, %xmm4
; SSE-NEXT:    addss %xmm2, %xmm4
; SSE-NEXT:    mulss %xmm4, %xmm0
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm4, %xmm1
; SSE-NEXT:    addss %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: f32_two_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: f32_two_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast float 1.0, %x
  ret float %div
}

; <1.0 x4>/x, 128-bit vector. Same attrs #0 as the scalar case: checks expect a
; real divps, no rcpps estimate.
define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    divps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v4f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_no_estimate:
; KNL:       # %bb.0:
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_no_estimate:
; SKX:       # %bb.0:
; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

; <1.0 x4>/x, 128-bit vector, attrs #1: rcpps estimate plus one Newton-Raphson step.
define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_one_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_one_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v4f32_one_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_one_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

; <1.0 x4>/x, 128-bit vector, attrs #2: rcpps estimate plus two Newton-Raphson steps.
define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    subps %xmm3, %xmm4
; SSE-NEXT:    mulps %xmm2, %xmm4
; SSE-NEXT:    addps %xmm2, %xmm4
; SSE-NEXT:    mulps %xmm4, %xmm0
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_two_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_two_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v4f32_two_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_two_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_two_step:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_two_step:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <4 x float> %div
}

; <1.0 x8>/x, 256-bit vector, attrs #0: expect real division (two divps halves on
; SSE, a single ymm vdivps with AVX).
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    divps %xmm0, %xmm3
; SSE-NEXT:    divps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v8f32_no_estimate:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v8f32_no_estimate:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; FMA-RECIP-NEXT:    retq
;
; BTVER2-LABEL: v8f32_no_estimate:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT:    retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_estimate:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00]
; SANDY-NEXT:    retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_estimate:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
; HASWELL-NEXT:    retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v8f32_no_estimate:
; KNL:       # %bb.0:
; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
; KNL-NEXT:    retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_no_estimate:
; SKX:       # %bb.0:
; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [11:5.00]
; SKX-NEXT:    retq # sched: [7:1.00]
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <8 x float> %div
}

610define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
611; SSE-LABEL: v8f32_one_step:
612; SSE:       # %bb.0:
613; SSE-NEXT:    rcpps %xmm0, %xmm4
614; SSE-NEXT:    mulps %xmm4, %xmm0
615; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
616; SSE-NEXT:    movaps %xmm2, %xmm3
617; SSE-NEXT:    subps %xmm0, %xmm3
618; SSE-NEXT:    mulps %xmm4, %xmm3
619; SSE-NEXT:    addps %xmm4, %xmm3
620; SSE-NEXT:    rcpps %xmm1, %xmm0
621; SSE-NEXT:    mulps %xmm0, %xmm1
622; SSE-NEXT:    subps %xmm1, %xmm2
623; SSE-NEXT:    mulps %xmm0, %xmm2
624; SSE-NEXT:    addps %xmm0, %xmm2
625; SSE-NEXT:    movaps %xmm3, %xmm0
626; SSE-NEXT:    movaps %xmm2, %xmm1
627; SSE-NEXT:    retq
628;
629; AVX-RECIP-LABEL: v8f32_one_step:
630; AVX-RECIP:       # %bb.0:
631; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
632; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
633; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
634; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
635; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
636; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
637; AVX-RECIP-NEXT:    retq
638;
639; FMA-RECIP-LABEL: v8f32_one_step:
640; FMA-RECIP:       # %bb.0:
641; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
642; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
643; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
644; FMA-RECIP-NEXT:    retq
645;
646; BTVER2-LABEL: v8f32_one_step:
647; BTVER2:       # %bb.0:
648; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
649; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
650; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
651; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
652; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
653; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
654; BTVER2-NEXT:    retq # sched: [4:1.00]
655;
656; SANDY-LABEL: v8f32_one_step:
657; SANDY:       # %bb.0:
658; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
659; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
660; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
661; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
662; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
663; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
664; SANDY-NEXT:    retq # sched: [1:1.00]
665;
666; HASWELL-LABEL: v8f32_one_step:
667; HASWELL:       # %bb.0:
668; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
669; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
670; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
671; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
672; HASWELL-NEXT:    retq # sched: [7:1.00]
673;
674; HASWELL-NO-FMA-LABEL: v8f32_one_step:
675; HASWELL-NO-FMA:       # %bb.0:
676; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
677; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
678; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
679; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
680; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
681; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
682; HASWELL-NO-FMA-NEXT:    retq
683;
684; KNL-LABEL: v8f32_one_step:
685; KNL:       # %bb.0:
686; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
687; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
688; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
689; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
690; KNL-NEXT:    retq # sched: [7:1.00]
691;
692; SKX-LABEL: v8f32_one_step:
693; SKX:       # %bb.0:
694; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
695; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
696; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
697; SKX-NEXT:    retq # sched: [7:1.00]
698  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
699  ret <8 x float> %div
700}
701
; Attribute #2 sets "reciprocal-estimates"="divf:2,vec-divf:2", so the 8 x float
; reciprocal is lowered as rcpps plus TWO Newton-Raphson refinement iterations
; (or their FMA equivalents on targets with FMA). Checks below are autogenerated
; by update_llc_test_checks.py -- do not edit them by hand.
702define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
703; SSE-LABEL: v8f32_two_step:
704; SSE:       # %bb.0:
705; SSE-NEXT:    movaps %xmm1, %xmm2
706; SSE-NEXT:    rcpps %xmm0, %xmm3
707; SSE-NEXT:    movaps %xmm0, %xmm4
708; SSE-NEXT:    mulps %xmm3, %xmm4
709; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
710; SSE-NEXT:    movaps %xmm1, %xmm5
711; SSE-NEXT:    subps %xmm4, %xmm5
712; SSE-NEXT:    mulps %xmm3, %xmm5
713; SSE-NEXT:    addps %xmm3, %xmm5
714; SSE-NEXT:    mulps %xmm5, %xmm0
715; SSE-NEXT:    movaps %xmm3, %xmm3
716; SSE-NEXT:    subps %xmm0, %xmm3
717; SSE-NEXT:    mulps %xmm5, %xmm3
718; SSE-NEXT:    addps %xmm5, %xmm3
719; SSE-NEXT:    rcpps %xmm2, %xmm0
720; SSE-NEXT:    movaps %xmm2, %xmm4
721; SSE-NEXT:    mulps %xmm0, %xmm4
722; SSE-NEXT:    movaps %xmm1, %xmm5
723; SSE-NEXT:    subps %xmm4, %xmm5
724; SSE-NEXT:    mulps %xmm0, %xmm5
725; SSE-NEXT:    addps %xmm0, %xmm5
726; SSE-NEXT:    mulps %xmm5, %xmm2
727; SSE-NEXT:    subps %xmm2, %xmm1
728; SSE-NEXT:    mulps %xmm5, %xmm1
729; SSE-NEXT:    addps %xmm5, %xmm1
730; SSE-NEXT:    movaps %xmm3, %xmm0
731; SSE-NEXT:    retq
732;
733; AVX-RECIP-LABEL: v8f32_two_step:
734; AVX-RECIP:       # %bb.0:
735; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
736; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
737; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
738; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
739; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
740; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
741; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
742; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
743; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
744; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
745; AVX-RECIP-NEXT:    retq
746;
747; FMA-RECIP-LABEL: v8f32_two_step:
748; FMA-RECIP:       # %bb.0:
749; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
750; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
751; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
752; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
753; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
754; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
755; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
756; FMA-RECIP-NEXT:    retq
757;
758; BTVER2-LABEL: v8f32_two_step:
759; BTVER2:       # %bb.0:
760; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
761; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
762; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
763; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
764; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
765; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
766; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
767; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
768; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
769; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
770; BTVER2-NEXT:    retq # sched: [4:1.00]
771;
772; SANDY-LABEL: v8f32_two_step:
773; SANDY:       # %bb.0:
774; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
775; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
776; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
777; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
778; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
779; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
780; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
781; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
782; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
783; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
784; SANDY-NEXT:    retq # sched: [1:1.00]
785;
786; HASWELL-LABEL: v8f32_two_step:
787; HASWELL:       # %bb.0:
788; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
789; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
790; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
791; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
792; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
793; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
794; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
795; HASWELL-NEXT:    retq # sched: [7:1.00]
796;
797; HASWELL-NO-FMA-LABEL: v8f32_two_step:
798; HASWELL-NO-FMA:       # %bb.0:
799; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
800; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
801; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
802; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
803; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
804; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
805; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
806; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
807; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
808; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
809; HASWELL-NO-FMA-NEXT:    retq
810;
811; KNL-LABEL: v8f32_two_step:
812; KNL:       # %bb.0:
813; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
814; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
815; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
816; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
817; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
818; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
819; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
820; KNL-NEXT:    retq # sched: [7:1.00]
821;
822; SKX-LABEL: v8f32_two_step:
823; SKX:       # %bb.0:
824; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
825; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
826; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
827; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
828; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
829; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50]
830; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50]
831; SKX-NEXT:    retq # sched: [7:1.00]
832  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
833  ret <8 x float> %div
834}
835
; Attribute #0 sets "reciprocal-estimates"="!divf,!vec-divf", disabling the
; estimate sequence: every target is expected to emit plain (v)divps for the
; 16 x float reciprocal (two 256-bit ops pre-AVX512, one 512-bit op on AVX512).
; Checks below are autogenerated by update_llc_test_checks.py -- do not edit.
836define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
837; SSE-LABEL: v16f32_no_estimate:
838; SSE:       # %bb.0:
839; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
840; SSE-NEXT:    movaps %xmm4, %xmm5
841; SSE-NEXT:    divps %xmm0, %xmm5
842; SSE-NEXT:    movaps %xmm4, %xmm6
843; SSE-NEXT:    divps %xmm1, %xmm6
844; SSE-NEXT:    movaps %xmm4, %xmm7
845; SSE-NEXT:    divps %xmm2, %xmm7
846; SSE-NEXT:    divps %xmm3, %xmm4
847; SSE-NEXT:    movaps %xmm5, %xmm0
848; SSE-NEXT:    movaps %xmm6, %xmm1
849; SSE-NEXT:    movaps %xmm7, %xmm2
850; SSE-NEXT:    movaps %xmm4, %xmm3
851; SSE-NEXT:    retq
852;
853; AVX-RECIP-LABEL: v16f32_no_estimate:
854; AVX-RECIP:       # %bb.0:
855; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
856; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
857; AVX-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
858; AVX-RECIP-NEXT:    retq
859;
860; FMA-RECIP-LABEL: v16f32_no_estimate:
861; FMA-RECIP:       # %bb.0:
862; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
863; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
864; FMA-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
865; FMA-RECIP-NEXT:    retq
866;
867; BTVER2-LABEL: v16f32_no_estimate:
868; BTVER2:       # %bb.0:
869; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
870; BTVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [38:38.00]
871; BTVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [38:38.00]
872; BTVER2-NEXT:    retq # sched: [4:1.00]
873;
874; SANDY-LABEL: v16f32_no_estimate:
875; SANDY:       # %bb.0:
876; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
877; SANDY-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00]
878; SANDY-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00]
879; SANDY-NEXT:    retq # sched: [1:1.00]
880;
881; HASWELL-LABEL: v16f32_no_estimate:
882; HASWELL:       # %bb.0:
883; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
884; HASWELL-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [21:14.00]
885; HASWELL-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [21:14.00]
886; HASWELL-NEXT:    retq # sched: [7:1.00]
887;
888; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
889; HASWELL-NO-FMA:       # %bb.0:
890; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
891; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm2, %ymm0
892; HASWELL-NO-FMA-NEXT:    vdivps %ymm1, %ymm2, %ymm1
893; HASWELL-NO-FMA-NEXT:    retq
894;
895; KNL-LABEL: v16f32_no_estimate:
896; KNL:       # %bb.0:
897; KNL-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
898; KNL-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [21:14.00]
899; KNL-NEXT:    retq # sched: [7:1.00]
900;
901; SKX-LABEL: v16f32_no_estimate:
902; SKX:       # %bb.0:
903; SKX-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
904; SKX-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
905; SKX-NEXT:    retq # sched: [7:1.00]
906  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
907  ret <16 x float> %div
908}
909
; Attribute #1 sets "reciprocal-estimates"="divf,vec-divf" (default refinement
; count), so the 16 x float reciprocal uses rcpps / vrcp14ps plus ONE
; Newton-Raphson refinement step per vector half (one 512-bit op on AVX512).
; Checks below are autogenerated by update_llc_test_checks.py -- do not edit.
910define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
911; SSE-LABEL: v16f32_one_step:
912; SSE:       # %bb.0:
913; SSE-NEXT:    movaps %xmm3, %xmm4
914; SSE-NEXT:    movaps %xmm0, %xmm5
915; SSE-NEXT:    rcpps %xmm0, %xmm6
916; SSE-NEXT:    mulps %xmm6, %xmm5
917; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
918; SSE-NEXT:    movaps %xmm3, %xmm0
919; SSE-NEXT:    subps %xmm5, %xmm0
920; SSE-NEXT:    mulps %xmm6, %xmm0
921; SSE-NEXT:    addps %xmm6, %xmm0
922; SSE-NEXT:    rcpps %xmm1, %xmm6
923; SSE-NEXT:    mulps %xmm6, %xmm1
924; SSE-NEXT:    movaps %xmm3, %xmm5
925; SSE-NEXT:    subps %xmm1, %xmm5
926; SSE-NEXT:    mulps %xmm6, %xmm5
927; SSE-NEXT:    addps %xmm6, %xmm5
928; SSE-NEXT:    rcpps %xmm2, %xmm1
929; SSE-NEXT:    mulps %xmm1, %xmm2
930; SSE-NEXT:    movaps %xmm3, %xmm6
931; SSE-NEXT:    subps %xmm2, %xmm6
932; SSE-NEXT:    mulps %xmm1, %xmm6
933; SSE-NEXT:    addps %xmm1, %xmm6
934; SSE-NEXT:    rcpps %xmm4, %xmm1
935; SSE-NEXT:    mulps %xmm1, %xmm4
936; SSE-NEXT:    subps %xmm4, %xmm3
937; SSE-NEXT:    mulps %xmm1, %xmm3
938; SSE-NEXT:    addps %xmm1, %xmm3
939; SSE-NEXT:    movaps %xmm5, %xmm1
940; SSE-NEXT:    movaps %xmm6, %xmm2
941; SSE-NEXT:    retq
942;
943; AVX-RECIP-LABEL: v16f32_one_step:
944; AVX-RECIP:       # %bb.0:
945; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
946; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
947; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
948; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
949; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
950; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
951; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
952; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
953; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
954; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
955; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
956; AVX-RECIP-NEXT:    retq
957;
958; FMA-RECIP-LABEL: v16f32_one_step:
959; FMA-RECIP:       # %bb.0:
960; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
961; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
962; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
963; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
964; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
965; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
966; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
967; FMA-RECIP-NEXT:    retq
968;
969; BTVER2-LABEL: v16f32_one_step:
970; BTVER2:       # %bb.0:
971; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
972; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
973; BTVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [2:2.00]
974; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
975; BTVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [2:2.00]
976; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
977; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
978; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
979; BTVER2-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [2:2.00]
980; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
981; BTVER2-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
982; BTVER2-NEXT:    retq # sched: [4:1.00]
983;
984; SANDY-LABEL: v16f32_one_step:
985; SANDY:       # %bb.0:
986; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
987; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
988; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
989; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
990; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
991; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
992; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
993; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
994; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
995; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
996; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
997; SANDY-NEXT:    retq # sched: [1:1.00]
998;
999; HASWELL-LABEL: v16f32_one_step:
1000; HASWELL:       # %bb.0:
1001; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1002; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1003; HASWELL-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
1004; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
1005; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
1006; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
1007; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
1008; HASWELL-NEXT:    retq # sched: [7:1.00]
1009;
1010; HASWELL-NO-FMA-LABEL: v16f32_one_step:
1011; HASWELL-NO-FMA:       # %bb.0:
1012; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
1013; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1014; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
1015; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1016; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1017; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1018; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
1019; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1020; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1021; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1022; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1023; HASWELL-NO-FMA-NEXT:    retq
1024;
1025; KNL-LABEL: v16f32_one_step:
1026; KNL:       # %bb.0:
1027; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1028; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1029; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1030; KNL-NEXT:    retq # sched: [7:1.00]
1031;
1032; SKX-LABEL: v16f32_one_step:
1033; SKX:       # %bb.0:
1034; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1035; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1036; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
1037; SKX-NEXT:    retq # sched: [7:1.00]
1038  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1039  ret <16 x float> %div
1040}
1041
; Attribute #2 requests two refinement steps ("reciprocal-estimates"="divf:2,vec-divf:2"):
; each 128/256-bit half (or the single 512-bit vector on AVX512) is lowered as
; rcpps / vrcp14ps followed by TWO Newton-Raphson iterations (FMA forms where available).
; Checks below are autogenerated by update_llc_test_checks.py -- do not edit.
1042define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
1043; SSE-LABEL: v16f32_two_step:
1044; SSE:       # %bb.0:
1045; SSE-NEXT:    movaps %xmm3, %xmm4
1046; SSE-NEXT:    movaps %xmm1, %xmm5
1047; SSE-NEXT:    movaps %xmm0, %xmm1
1048; SSE-NEXT:    rcpps %xmm0, %xmm0
1049; SSE-NEXT:    movaps %xmm1, %xmm6
1050; SSE-NEXT:    mulps %xmm0, %xmm6
1051; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1052; SSE-NEXT:    movaps %xmm3, %xmm7
1053; SSE-NEXT:    subps %xmm6, %xmm7
1054; SSE-NEXT:    mulps %xmm0, %xmm7
1055; SSE-NEXT:    addps %xmm0, %xmm7
1056; SSE-NEXT:    mulps %xmm7, %xmm1
1057; SSE-NEXT:    movaps %xmm3, %xmm0
1058; SSE-NEXT:    subps %xmm1, %xmm0
1059; SSE-NEXT:    mulps %xmm7, %xmm0
1060; SSE-NEXT:    addps %xmm7, %xmm0
1061; SSE-NEXT:    rcpps %xmm5, %xmm1
1062; SSE-NEXT:    movaps %xmm5, %xmm6
1063; SSE-NEXT:    mulps %xmm1, %xmm6
1064; SSE-NEXT:    movaps %xmm3, %xmm7
1065; SSE-NEXT:    subps %xmm6, %xmm7
1066; SSE-NEXT:    mulps %xmm1, %xmm7
1067; SSE-NEXT:    addps %xmm1, %xmm7
1068; SSE-NEXT:    mulps %xmm7, %xmm5
1069; SSE-NEXT:    movaps %xmm3, %xmm1
1070; SSE-NEXT:    subps %xmm5, %xmm1
1071; SSE-NEXT:    mulps %xmm7, %xmm1
1072; SSE-NEXT:    addps %xmm7, %xmm1
1073; SSE-NEXT:    rcpps %xmm2, %xmm5
1074; SSE-NEXT:    movaps %xmm2, %xmm6
1075; SSE-NEXT:    mulps %xmm5, %xmm6
1076; SSE-NEXT:    movaps %xmm3, %xmm7
1077; SSE-NEXT:    subps %xmm6, %xmm7
1078; SSE-NEXT:    mulps %xmm5, %xmm7
1079; SSE-NEXT:    addps %xmm5, %xmm7
1080; SSE-NEXT:    mulps %xmm7, %xmm2
1081; SSE-NEXT:    movaps %xmm3, %xmm5
1082; SSE-NEXT:    subps %xmm2, %xmm5
1083; SSE-NEXT:    mulps %xmm7, %xmm5
1084; SSE-NEXT:    addps %xmm7, %xmm5
1085; SSE-NEXT:    rcpps %xmm4, %xmm2
1086; SSE-NEXT:    movaps %xmm4, %xmm6
1087; SSE-NEXT:    mulps %xmm2, %xmm6
1088; SSE-NEXT:    movaps %xmm3, %xmm7
1089; SSE-NEXT:    subps %xmm6, %xmm7
1090; SSE-NEXT:    mulps %xmm2, %xmm7
1091; SSE-NEXT:    addps %xmm2, %xmm7
1092; SSE-NEXT:    mulps %xmm7, %xmm4
1093; SSE-NEXT:    subps %xmm4, %xmm3
1094; SSE-NEXT:    mulps %xmm7, %xmm3
1095; SSE-NEXT:    addps %xmm7, %xmm3
1096; SSE-NEXT:    movaps %xmm5, %xmm2
1097; SSE-NEXT:    retq
1098;
1099; AVX-RECIP-LABEL: v16f32_two_step:
1100; AVX-RECIP:       # %bb.0:
1101; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1102; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
1103; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1104; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
1105; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
1106; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
1107; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1108; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm4, %ymm0
1109; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1110; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1111; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1112; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
1113; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
1114; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
1115; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
1116; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1117; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm4, %ymm1
1118; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1119; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1120; AVX-RECIP-NEXT:    retq
1121;
1122; FMA-RECIP-LABEL: v16f32_two_step:
1123; FMA-RECIP:       # %bb.0:
1124; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1125; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1126; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
1127; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
1128; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1129; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
1130; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
1131; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1132; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
1133; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
1134; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1135; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
1136; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
1137; FMA-RECIP-NEXT:    retq
1138;
1139; BTVER2-LABEL: v16f32_two_step:
1140; BTVER2:       # %bb.0:
1141; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
1142; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
1143; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
1144; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
1145; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
1146; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
1147; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
1148; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
1149; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
1150; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
1151; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
1152; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
1153; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
1154; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
1155; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
1156; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
1157; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
1158; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1159; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
1160; BTVER2-NEXT:    retq # sched: [4:1.00]
1161;
1162; SANDY-LABEL: v16f32_two_step:
1163; SANDY:       # %bb.0:
1164; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
1165; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
1166; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
1167; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1168; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
1169; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1170; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
1171; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
1172; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
1173; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1174; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
1175; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
1176; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1177; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
1178; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1179; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
1180; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1181; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
1182; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1183; SANDY-NEXT:    retq # sched: [1:1.00]
1184;
1185; HASWELL-LABEL: v16f32_two_step:
1186; HASWELL:       # %bb.0:
1187; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1188; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1189; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
1190; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
1191; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
1192; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
1193; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
1194; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1195; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
1196; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
1197; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
1198; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
1199; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
1200; HASWELL-NEXT:    retq # sched: [7:1.00]
1201;
1202; HASWELL-NO-FMA-LABEL: v16f32_two_step:
1203; HASWELL-NO-FMA:       # %bb.0:
1204; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
1205; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3
1206; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
1207; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
1208; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
1209; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
1210; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1211; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0
1212; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1213; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1214; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2
1215; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3
1216; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
1217; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
1218; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
1219; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1220; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1
1221; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1222; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1223; HASWELL-NO-FMA-NEXT:    retq
1224;
1225; KNL-LABEL: v16f32_two_step:
1226; KNL:       # %bb.0:
1227; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1228; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1229; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
1230; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
1231; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
1232; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
1233; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
1234; KNL-NEXT:    retq # sched: [7:1.00]
1235;
1236; SKX-LABEL: v16f32_two_step:
1237; SKX:       # %bb.0:
1238; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1239; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1240; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
1241; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
1242; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
1243; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
1244; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
1245; SKX-NEXT:    retq # sched: [7:1.00]
1246  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1247  ret <16 x float> %div
1248}
1249
; #0: "!divf,!vec-divf" disables the reciprocal-estimate transform -> plain (v)divps.
1250attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
; #1: "divf,vec-divf" enables the estimate with the default refinement count (one step).
1251attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
; #2: ":2" suffix requests two Newton-Raphson refinement steps.
1252attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
1253
1254