1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule       | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -print-schedule         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -print-schedule         | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
11
12; These are extra test cases providing coverage for the reciprocal-estimate (recip) transform, as discussed in D26855.
13
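; Note on the checked sequences: given the hardware estimate r0 ~= 1/x from
; rcpss/rcpps, one Newton-Raphson refinement step computes
;   r1 = r0 + r0*(1 - x*r0)    (equivalently r0*(2 - x*r0))
; which is the mul/sub/add chain below, or the vfnmadd/vfmadd pair when FMA is
; available; a "two step" expansion repeats the same update on r1.
; The #1/#2/#3 attribute groups (presumably defined at the end of the file)
; appear to request one, two, and zero refinement steps respectively.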
14define float @f32_no_step_2(float %x) #3 {
15; SSE-LABEL: f32_no_step_2:
16; SSE:       # %bb.0:
17; SSE-NEXT:    rcpss %xmm0, %xmm0
18; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0
19; SSE-NEXT:    retq
20;
21; AVX-RECIP-LABEL: f32_no_step_2:
22; AVX-RECIP:       # %bb.0:
23; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
24; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
25; AVX-RECIP-NEXT:    retq
26;
27; FMA-RECIP-LABEL: f32_no_step_2:
28; FMA-RECIP:       # %bb.0:
29; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
30; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
31; FMA-RECIP-NEXT:    retq
32;
33; BTVER2-LABEL: f32_no_step_2:
34; BTVER2:       # %bb.0:
35; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
36; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
37; BTVER2-NEXT:    retq # sched: [4:1.00]
38;
39; SANDY-LABEL: f32_no_step_2:
40; SANDY:       # %bb.0:
41; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
42; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
43; SANDY-NEXT:    retq # sched: [1:1.00]
44;
45; HASWELL-LABEL: f32_no_step_2:
46; HASWELL:       # %bb.0:
47; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
48; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
49; HASWELL-NEXT:    retq # sched: [7:1.00]
50;
51; HASWELL-NO-FMA-LABEL: f32_no_step_2:
52; HASWELL-NO-FMA:       # %bb.0:
53; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
54; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
55; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
56;
57; KNL-LABEL: f32_no_step_2:
58; KNL:       # %bb.0:
59; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
60; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
61; KNL-NEXT:    retq # sched: [7:1.00]
62;
63; SKX-LABEL: f32_no_step_2:
64; SKX:       # %bb.0:
65; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
66; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
67; SKX-NEXT:    retq # sched: [7:1.00]
68  %div = fdiv fast float 1234.0, %x
69  ret float %div
70}
71
72define float @f32_one_step_2(float %x) #1 {
73; SSE-LABEL: f32_one_step_2:
74; SSE:       # %bb.0:
75; SSE-NEXT:    rcpss %xmm0, %xmm2
76; SSE-NEXT:    mulss %xmm2, %xmm0
77; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
78; SSE-NEXT:    subss %xmm0, %xmm1
79; SSE-NEXT:    mulss %xmm2, %xmm1
80; SSE-NEXT:    addss %xmm2, %xmm1
81; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
82; SSE-NEXT:    movaps %xmm1, %xmm0
83; SSE-NEXT:    retq
84;
85; AVX-RECIP-LABEL: f32_one_step_2:
86; AVX-RECIP:       # %bb.0:
87; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
88; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
89; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
90; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
91; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
92; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
93; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
94; AVX-RECIP-NEXT:    retq
95;
96; FMA-RECIP-LABEL: f32_one_step_2:
97; FMA-RECIP:       # %bb.0:
98; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
99; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
100; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
101; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
102; FMA-RECIP-NEXT:    retq
103;
104; BTVER2-LABEL: f32_one_step_2:
105; BTVER2:       # %bb.0:
106; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
107; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
108; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
109; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
110; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
111; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
112; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
113; BTVER2-NEXT:    retq # sched: [4:1.00]
114;
115; SANDY-LABEL: f32_one_step_2:
116; SANDY:       # %bb.0:
117; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
118; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
119; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
120; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
121; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
122; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
123; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
124; SANDY-NEXT:    retq # sched: [1:1.00]
125;
126; HASWELL-LABEL: f32_one_step_2:
127; HASWELL:       # %bb.0:
128; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
129; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
130; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
131; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
132; HASWELL-NEXT:    retq # sched: [7:1.00]
133;
134; HASWELL-NO-FMA-LABEL: f32_one_step_2:
135; HASWELL-NO-FMA:       # %bb.0:
136; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
137; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
138; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
139; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
140; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
141; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
142; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
143; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
144;
145; KNL-LABEL: f32_one_step_2:
146; KNL:       # %bb.0:
147; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
148; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
149; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
150; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
151; KNL-NEXT:    retq # sched: [7:1.00]
152;
153; SKX-LABEL: f32_one_step_2:
154; SKX:       # %bb.0:
155; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
156; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
157; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
158; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
159; SKX-NEXT:    retq # sched: [7:1.00]
160  %div = fdiv fast float 3456.0, %x
161  ret float %div
162}
163
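; Here (3456.0 / x) / x == 3456.0 / (x*x), so the expansion below computes a
; single refined reciprocal estimate r and reuses it for both divisions: the
; constant is multiplied by r twice rather than running two full expansions.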
164define float @f32_one_step_2_divs(float %x) #1 {
165; SSE-LABEL: f32_one_step_2_divs:
166; SSE:       # %bb.0:
167; SSE-NEXT:    rcpss %xmm0, %xmm1
168; SSE-NEXT:    mulss %xmm1, %xmm0
169; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
170; SSE-NEXT:    subss %xmm0, %xmm2
171; SSE-NEXT:    mulss %xmm1, %xmm2
172; SSE-NEXT:    addss %xmm1, %xmm2
173; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
174; SSE-NEXT:    mulss %xmm2, %xmm0
175; SSE-NEXT:    mulss %xmm2, %xmm0
176; SSE-NEXT:    retq
177;
178; AVX-RECIP-LABEL: f32_one_step_2_divs:
179; AVX-RECIP:       # %bb.0:
180; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
181; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
182; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
183; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
184; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
185; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
186; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
187; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
188; AVX-RECIP-NEXT:    retq
189;
190; FMA-RECIP-LABEL: f32_one_step_2_divs:
191; FMA-RECIP:       # %bb.0:
192; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
193; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
194; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
195; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
196; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
197; FMA-RECIP-NEXT:    retq
198;
199; BTVER2-LABEL: f32_one_step_2_divs:
200; BTVER2:       # %bb.0:
201; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
202; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
203; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
204; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
205; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
206; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
207; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
208; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
209; BTVER2-NEXT:    retq # sched: [4:1.00]
210;
211; SANDY-LABEL: f32_one_step_2_divs:
212; SANDY:       # %bb.0:
213; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
214; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
215; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
216; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
217; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
218; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
219; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
220; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
221; SANDY-NEXT:    retq # sched: [1:1.00]
222;
223; HASWELL-LABEL: f32_one_step_2_divs:
224; HASWELL:       # %bb.0:
225; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
226; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
227; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
228; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
229; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
230; HASWELL-NEXT:    retq # sched: [7:1.00]
231;
232; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
233; HASWELL-NO-FMA:       # %bb.0:
234; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
235; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
236; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
237; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
238; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
239; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
240; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
241; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
242; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
243;
244; KNL-LABEL: f32_one_step_2_divs:
245; KNL:       # %bb.0:
246; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
247; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
248; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
249; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
250; KNL-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
251; KNL-NEXT:    retq # sched: [7:1.00]
252;
253; SKX-LABEL: f32_one_step_2_divs:
254; SKX:       # %bb.0:
255; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
256; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [9:0.50]
257; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
258; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
259; SKX-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
260; SKX-NEXT:    retq # sched: [7:1.00]
261  %div = fdiv fast float 3456.0, %x
262  %div2 = fdiv fast float %div, %x
263  ret float %div2
264}
265
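; Two refinement steps: r1 = r0 + r0*(1 - x*r0), then r2 = r1 + r1*(1 - x*r1),
; and the result is scaled by 6789.0. The FMA variants keep the 1.0 constant in
; a register so it can feed both vfnmadd213ss steps.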
266define float @f32_two_step_2(float %x) #2 {
267; SSE-LABEL: f32_two_step_2:
268; SSE:       # %bb.0:
269; SSE-NEXT:    rcpss %xmm0, %xmm2
270; SSE-NEXT:    movaps %xmm0, %xmm3
271; SSE-NEXT:    mulss %xmm2, %xmm3
272; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
273; SSE-NEXT:    movaps %xmm1, %xmm4
274; SSE-NEXT:    subss %xmm3, %xmm4
275; SSE-NEXT:    mulss %xmm2, %xmm4
276; SSE-NEXT:    addss %xmm2, %xmm4
277; SSE-NEXT:    mulss %xmm4, %xmm0
278; SSE-NEXT:    subss %xmm0, %xmm1
279; SSE-NEXT:    mulss %xmm4, %xmm1
280; SSE-NEXT:    addss %xmm4, %xmm1
281; SSE-NEXT:    mulss {{.*}}(%rip), %xmm1
282; SSE-NEXT:    movaps %xmm1, %xmm0
283; SSE-NEXT:    retq
284;
285; AVX-RECIP-LABEL: f32_two_step_2:
286; AVX-RECIP:       # %bb.0:
287; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
288; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
289; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
290; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
291; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
292; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
293; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
294; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
295; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
296; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
297; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
298; AVX-RECIP-NEXT:    retq
299;
300; FMA-RECIP-LABEL: f32_two_step_2:
301; FMA-RECIP:       # %bb.0:
302; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
303; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
304; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
305; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
306; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
307; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
308; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
309; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
310; FMA-RECIP-NEXT:    retq
311;
312; BTVER2-LABEL: f32_two_step_2:
313; BTVER2:       # %bb.0:
314; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
315; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
316; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
317; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
318; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
319; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
320; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
321; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
322; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
323; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
324; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
325; BTVER2-NEXT:    retq # sched: [4:1.00]
326;
327; SANDY-LABEL: f32_two_step_2:
328; SANDY:       # %bb.0:
329; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
330; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
331; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
332; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
333; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
334; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
335; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
336; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
337; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
338; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
339; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
340; SANDY-NEXT:    retq # sched: [1:1.00]
341;
342; HASWELL-LABEL: f32_two_step_2:
343; HASWELL:       # %bb.0:
344; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
345; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
346; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
347; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
348; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
349; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
350; HASWELL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
351; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
352; HASWELL-NEXT:    retq # sched: [7:1.00]
353;
354; HASWELL-NO-FMA-LABEL: f32_two_step_2:
355; HASWELL-NO-FMA:       # %bb.0:
356; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
357; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
358; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:0.50]
359; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
360; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
361; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
362; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
363; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
364; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
365; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
366; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
367; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
368;
369; KNL-LABEL: f32_two_step_2:
370; KNL:       # %bb.0:
371; KNL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
372; KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
373; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
374; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
375; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
376; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
377; KNL-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
378; KNL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
379; KNL-NEXT:    retq # sched: [7:1.00]
380;
381; SKX-LABEL: f32_two_step_2:
382; SKX:       # %bb.0:
383; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
384; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
385; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
386; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
387; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
388; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
389; SKX-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
390; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
391; SKX-NEXT:    retq # sched: [7:1.00]
392  %div = fdiv fast float 6789.0, %x
393  ret float %div
394}
395
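; Vector form of the one-step expansion: rcpps plus the same refinement against
; a splat of 1.0 (a constant-pool vector or vbroadcastss), with the final
; multiply by <1,2,3,4> left as a constant-pool load. On SKX the 1.0 splat is
; folded into the vfnmadd213ps memory operand instead of being materialized.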
396define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
397; SSE-LABEL: v4f32_one_step2:
398; SSE:       # %bb.0:
399; SSE-NEXT:    rcpps %xmm0, %xmm2
400; SSE-NEXT:    mulps %xmm2, %xmm0
401; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
402; SSE-NEXT:    subps %xmm0, %xmm1
403; SSE-NEXT:    mulps %xmm2, %xmm1
404; SSE-NEXT:    addps %xmm2, %xmm1
405; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
406; SSE-NEXT:    movaps %xmm1, %xmm0
407; SSE-NEXT:    retq
408;
409; AVX-RECIP-LABEL: v4f32_one_step2:
410; AVX-RECIP:       # %bb.0:
411; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
412; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
413; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
414; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
415; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
416; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
417; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
418; AVX-RECIP-NEXT:    retq
419;
420; FMA-RECIP-LABEL: v4f32_one_step2:
421; FMA-RECIP:       # %bb.0:
422; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
423; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
424; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
425; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
426; FMA-RECIP-NEXT:    retq
427;
428; BTVER2-LABEL: v4f32_one_step2:
429; BTVER2:       # %bb.0:
430; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
431; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
432; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
433; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
434; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
435; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
436; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
437; BTVER2-NEXT:    retq # sched: [4:1.00]
438;
439; SANDY-LABEL: v4f32_one_step2:
440; SANDY:       # %bb.0:
441; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
442; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
443; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
444; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
445; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
446; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
447; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
448; SANDY-NEXT:    retq # sched: [1:1.00]
449;
450; HASWELL-LABEL: v4f32_one_step2:
451; HASWELL:       # %bb.0:
452; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
453; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
454; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
455; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
456; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
457; HASWELL-NEXT:    retq # sched: [7:1.00]
458;
459; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
460; HASWELL-NO-FMA:       # %bb.0:
461; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
462; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
463; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
464; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
465; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
466; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
467; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
468; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
469;
470; KNL-LABEL: v4f32_one_step2:
471; KNL:       # %bb.0:
472; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
473; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
474; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
475; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
476; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
477; KNL-NEXT:    retq # sched: [7:1.00]
478;
479; SKX-LABEL: v4f32_one_step2:
480; SKX:       # %bb.0:
481; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
482; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
483; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
484; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
485; SKX-NEXT:    retq # sched: [7:1.00]
486  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
487  ret <4 x float> %div
488}
489
490define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
491; SSE-LABEL: v4f32_one_step_2_divs:
492; SSE:       # %bb.0:
493; SSE-NEXT:    rcpps %xmm0, %xmm1
494; SSE-NEXT:    mulps %xmm1, %xmm0
495; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
496; SSE-NEXT:    subps %xmm0, %xmm2
497; SSE-NEXT:    mulps %xmm1, %xmm2
498; SSE-NEXT:    addps %xmm1, %xmm2
499; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
500; SSE-NEXT:    mulps %xmm2, %xmm0
501; SSE-NEXT:    mulps %xmm2, %xmm0
502; SSE-NEXT:    retq
503;
504; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
505; AVX-RECIP:       # %bb.0:
506; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
507; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
508; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
509; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
510; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
511; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
512; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
513; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
514; AVX-RECIP-NEXT:    retq
515;
516; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
517; FMA-RECIP:       # %bb.0:
518; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
519; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem
520; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
521; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
522; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
523; FMA-RECIP-NEXT:    retq
524;
525; BTVER2-LABEL: v4f32_one_step_2_divs:
526; BTVER2:       # %bb.0:
527; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
528; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
529; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
530; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
531; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
532; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
533; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [7:1.00]
534; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
535; BTVER2-NEXT:    retq # sched: [4:1.00]
536;
537; SANDY-LABEL: v4f32_one_step_2_divs:
538; SANDY:       # %bb.0:
539; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
540; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
541; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
542; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
543; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
544; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
545; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
546; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
547; SANDY-NEXT:    retq # sched: [1:1.00]
548;
549; HASWELL-LABEL: v4f32_one_step_2_divs:
550; HASWELL:       # %bb.0:
551; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
552; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
553; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
554; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
555; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
556; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
557; HASWELL-NEXT:    retq # sched: [7:1.00]
558;
559; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
560; HASWELL-NO-FMA:       # %bb.0:
561; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
562; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
563; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
564; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
565; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
566; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
567; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
568; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
569; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
570;
571; KNL-LABEL: v4f32_one_step_2_divs:
572; KNL:       # %bb.0:
573; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
574; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
575; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
576; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
577; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
578; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
579; KNL-NEXT:    retq # sched: [7:1.00]
580;
581; SKX-LABEL: v4f32_one_step_2_divs:
582; SKX:       # %bb.0:
583; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
584; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
585; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [4:0.50]
586; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
587; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
588; SKX-NEXT:    retq # sched: [7:1.00]
589  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
590  %div2 = fdiv fast <4 x float> %div, %x
591  ret <4 x float> %div2
592}
593
594define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
595; SSE-LABEL: v4f32_two_step2:
596; SSE:       # %bb.0:
597; SSE-NEXT:    rcpps %xmm0, %xmm2
598; SSE-NEXT:    movaps %xmm0, %xmm3
599; SSE-NEXT:    mulps %xmm2, %xmm3
600; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
601; SSE-NEXT:    movaps %xmm1, %xmm4
602; SSE-NEXT:    subps %xmm3, %xmm4
603; SSE-NEXT:    mulps %xmm2, %xmm4
604; SSE-NEXT:    addps %xmm2, %xmm4
605; SSE-NEXT:    mulps %xmm4, %xmm0
606; SSE-NEXT:    subps %xmm0, %xmm1
607; SSE-NEXT:    mulps %xmm4, %xmm1
608; SSE-NEXT:    addps %xmm4, %xmm1
609; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
610; SSE-NEXT:    movaps %xmm1, %xmm0
611; SSE-NEXT:    retq
612;
613; AVX-RECIP-LABEL: v4f32_two_step2:
614; AVX-RECIP:       # %bb.0:
615; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
616; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
617; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
618; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
619; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
620; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
621; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
622; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
623; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
624; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
625; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
626; AVX-RECIP-NEXT:    retq
627;
628; FMA-RECIP-LABEL: v4f32_two_step2:
629; FMA-RECIP:       # %bb.0:
630; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
631; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
632; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
633; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
634; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
635; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2
636; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
637; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
638; FMA-RECIP-NEXT:    retq
639;
640; BTVER2-LABEL: v4f32_two_step2:
641; BTVER2:       # %bb.0:
642; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
643; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
644; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
645; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
646; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [2:1.00]
647; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
648; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
649; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
650; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [2:1.00]
651; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
652; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
653; BTVER2-NEXT:    retq # sched: [4:1.00]
654;
655; SANDY-LABEL: v4f32_two_step2:
656; SANDY:       # %bb.0:
657; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
658; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
659; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
660; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
661; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
662; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
663; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
664; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
665; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
666; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
667; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
668; SANDY-NEXT:    retq # sched: [1:1.00]
669;
670; HASWELL-LABEL: v4f32_two_step2:
671; HASWELL:       # %bb.0:
672; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
673; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
674; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
675; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
676; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
677; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
678; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
679; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
680; HASWELL-NEXT:    retq # sched: [7:1.00]
681;
682; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
683; HASWELL-NO-FMA:       # %bb.0:
684; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
685; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
686; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
687; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
688; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
689; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
690; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
691; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
692; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
693; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
694; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
695; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
696;
697; KNL-LABEL: v4f32_two_step2:
698; KNL:       # %bb.0:
699; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
700; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
701; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
702; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
703; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
704; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [5:0.50]
705; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [5:0.50]
706; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
707; KNL-NEXT:    retq # sched: [7:1.00]
708;
709; SKX-LABEL: v4f32_two_step2:
710; SKX:       # %bb.0:
711; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
712; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
713; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
714; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
715; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
716; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm2 sched: [4:0.50]
717; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3 sched: [4:0.50]
718; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
719; SKX-NEXT:    retq # sched: [7:1.00]
720  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
721  ret <4 x float> %div
722}
723
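; 256-bit variants: plain SSE has no 256-bit registers, so the <8 x float>
; division is legalized as two independent 128-bit rcpps/refinement sequences,
; while the AVX targets use a single ymm rcpps followed by one refinement.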
724define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
725; SSE-LABEL: v8f32_one_step2:
726; SSE:       # %bb.0:
727; SSE-NEXT:    rcpps %xmm1, %xmm4
728; SSE-NEXT:    mulps %xmm4, %xmm1
729; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
730; SSE-NEXT:    movaps %xmm2, %xmm3
731; SSE-NEXT:    subps %xmm1, %xmm3
732; SSE-NEXT:    mulps %xmm4, %xmm3
733; SSE-NEXT:    addps %xmm4, %xmm3
734; SSE-NEXT:    rcpps %xmm0, %xmm1
735; SSE-NEXT:    mulps %xmm1, %xmm0
736; SSE-NEXT:    subps %xmm0, %xmm2
737; SSE-NEXT:    mulps %xmm1, %xmm2
738; SSE-NEXT:    addps %xmm1, %xmm2
739; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
740; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
741; SSE-NEXT:    movaps %xmm2, %xmm0
742; SSE-NEXT:    movaps %xmm3, %xmm1
743; SSE-NEXT:    retq
744;
745; AVX-RECIP-LABEL: v8f32_one_step2:
746; AVX-RECIP:       # %bb.0:
747; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
748; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
749; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
750; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
751; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
752; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
753; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
754; AVX-RECIP-NEXT:    retq
755;
756; FMA-RECIP-LABEL: v8f32_one_step2:
757; FMA-RECIP:       # %bb.0:
758; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
759; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
760; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
761; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
762; FMA-RECIP-NEXT:    retq
763;
764; BTVER2-LABEL: v8f32_one_step2:
765; BTVER2:       # %bb.0:
766; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
767; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
768; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
769; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
770; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
771; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
772; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
773; BTVER2-NEXT:    retq # sched: [4:1.00]
774;
775; SANDY-LABEL: v8f32_one_step2:
776; SANDY:       # %bb.0:
777; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
778; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
779; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
780; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
781; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
782; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
783; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
784; SANDY-NEXT:    retq # sched: [1:1.00]
785;
786; HASWELL-LABEL: v8f32_one_step2:
787; HASWELL:       # %bb.0:
788; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
789; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
790; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
791; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
792; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
793; HASWELL-NEXT:    retq # sched: [7:1.00]
794;
795; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
796; HASWELL-NO-FMA:       # %bb.0:
797; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
798; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
799; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
800; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
801; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
802; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
803; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
804; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
805;
806; KNL-LABEL: v8f32_one_step2:
807; KNL:       # %bb.0:
808; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
809; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
810; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
811; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
812; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
813; KNL-NEXT:    retq # sched: [7:1.00]
814;
815; SKX-LABEL: v8f32_one_step2:
816; SKX:       # %bb.0:
817; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
818; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
819; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
820; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
821; SKX-NEXT:    retq # sched: [7:1.00]
822  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
823  ret <8 x float> %div
824}
825
826define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
827; SSE-LABEL: v8f32_one_step_2_divs:
828; SSE:       # %bb.0:
829; SSE-NEXT:    rcpps %xmm0, %xmm2
830; SSE-NEXT:    mulps %xmm2, %xmm0
831; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
832; SSE-NEXT:    movaps %xmm3, %xmm4
833; SSE-NEXT:    subps %xmm0, %xmm4
834; SSE-NEXT:    mulps %xmm2, %xmm4
835; SSE-NEXT:    addps %xmm2, %xmm4
836; SSE-NEXT:    rcpps %xmm1, %xmm0
837; SSE-NEXT:    mulps %xmm0, %xmm1
838; SSE-NEXT:    subps %xmm1, %xmm3
839; SSE-NEXT:    mulps %xmm0, %xmm3
840; SSE-NEXT:    addps %xmm0, %xmm3
841; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
842; SSE-NEXT:    mulps %xmm3, %xmm1
843; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
844; SSE-NEXT:    mulps %xmm4, %xmm0
845; SSE-NEXT:    mulps %xmm4, %xmm0
846; SSE-NEXT:    mulps %xmm3, %xmm1
847; SSE-NEXT:    retq
848;
849; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
850; AVX-RECIP:       # %bb.0:
851; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
852; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
853; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
854; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
855; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
856; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
857; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
858; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
859; AVX-RECIP-NEXT:    retq
860;
861; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
862; FMA-RECIP:       # %bb.0:
863; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
864; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem
865; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
866; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
867; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
868; FMA-RECIP-NEXT:    retq
869;
870; BTVER2-LABEL: v8f32_one_step_2_divs:
871; BTVER2:       # %bb.0:
872; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
873; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
874; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
875; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
876; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
877; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
878; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [7:2.00]
879; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
880; BTVER2-NEXT:    retq # sched: [4:1.00]
881;
882; SANDY-LABEL: v8f32_one_step_2_divs:
883; SANDY:       # %bb.0:
884; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
885; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
886; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
887; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
888; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
889; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
890; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
891; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
892; SANDY-NEXT:    retq # sched: [1:1.00]
893;
894; HASWELL-LABEL: v8f32_one_step_2_divs:
895; HASWELL:       # %bb.0:
896; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
897; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
898; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
899; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
900; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
901; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
902; HASWELL-NEXT:    retq # sched: [7:1.00]
903;
904; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
905; HASWELL-NO-FMA:       # %bb.0:
906; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
907; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
908; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
909; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
910; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
911; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
912; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
913; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
914; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
915;
916; KNL-LABEL: v8f32_one_step_2_divs:
917; KNL:       # %bb.0:
918; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
919; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
920; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
921; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
922; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
923; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
924; KNL-NEXT:    retq # sched: [7:1.00]
925;
926; SKX-LABEL: v8f32_one_step_2_divs:
927; SKX:       # %bb.0:
928; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
929; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [11:0.50]
930; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [4:0.50]
931; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
932; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
933; SKX-NEXT:    retq # sched: [7:1.00]
934  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
935  %div2 = fdiv fast <8 x float> %div, %x
936  ret <8 x float> %div2
937}
938
939define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
940; SSE-LABEL: v8f32_two_step2:
941; SSE:       # %bb.0:
942; SSE-NEXT:    movaps %xmm0, %xmm2
943; SSE-NEXT:    rcpps %xmm1, %xmm3
944; SSE-NEXT:    movaps %xmm1, %xmm4
945; SSE-NEXT:    mulps %xmm3, %xmm4
946; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
947; SSE-NEXT:    movaps %xmm0, %xmm5
948; SSE-NEXT:    subps %xmm4, %xmm5
949; SSE-NEXT:    mulps %xmm3, %xmm5
950; SSE-NEXT:    addps %xmm3, %xmm5
951; SSE-NEXT:    mulps %xmm5, %xmm1
952; SSE-NEXT:    movaps %xmm0, %xmm3
953; SSE-NEXT:    subps %xmm1, %xmm3
954; SSE-NEXT:    mulps %xmm5, %xmm3
955; SSE-NEXT:    addps %xmm5, %xmm3
956; SSE-NEXT:    rcpps %xmm2, %xmm1
957; SSE-NEXT:    movaps %xmm2, %xmm4
958; SSE-NEXT:    mulps %xmm1, %xmm4
959; SSE-NEXT:    movaps %xmm0, %xmm5
960; SSE-NEXT:    subps %xmm4, %xmm5
961; SSE-NEXT:    mulps %xmm1, %xmm5
962; SSE-NEXT:    addps %xmm1, %xmm5
963; SSE-NEXT:    mulps %xmm5, %xmm2
964; SSE-NEXT:    subps %xmm2, %xmm0
965; SSE-NEXT:    mulps %xmm5, %xmm0
966; SSE-NEXT:    addps %xmm5, %xmm0
967; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
968; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
969; SSE-NEXT:    movaps %xmm3, %xmm1
970; SSE-NEXT:    retq
971;
972; AVX-RECIP-LABEL: v8f32_two_step2:
973; AVX-RECIP:       # %bb.0:
974; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
975; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
976; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
977; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
978; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
979; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
980; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
981; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
982; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
983; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
984; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
985; AVX-RECIP-NEXT:    retq
986;
987; FMA-RECIP-LABEL: v8f32_two_step2:
988; FMA-RECIP:       # %bb.0:
989; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
990; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
991; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
992; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
993; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
994; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2
995; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
996; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
997; FMA-RECIP-NEXT:    retq
998;
999; BTVER2-LABEL: v8f32_two_step2:
1000; BTVER2:       # %bb.0:
1001; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
1002; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
1003; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
1004; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
1005; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [2:2.00]
1006; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:2.00]
1007; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
1008; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
1009; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [2:2.00]
1010; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
1011; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1012; BTVER2-NEXT:    retq # sched: [4:1.00]
1013;
1014; SANDY-LABEL: v8f32_two_step2:
1015; SANDY:       # %bb.0:
1016; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
1017; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
1018; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
1019; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
1020; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
1021; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
1022; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
1023; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1024; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
1025; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
1026; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1027; SANDY-NEXT:    retq # sched: [1:1.00]
1028;
1029; HASWELL-LABEL: v8f32_two_step2:
1030; HASWELL:       # %bb.0:
1031; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
1032; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1033; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
1034; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
1035; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
1036; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
1037; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
1038; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1039; HASWELL-NEXT:    retq # sched: [7:1.00]
1040;
1041; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
1042; HASWELL-NO-FMA:       # %bb.0:
1043; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
1044; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
1045; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1046; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
1047; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
1048; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
1049; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
1050; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1051; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
1052; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
1053; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1054; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1055;
1056; KNL-LABEL: v8f32_two_step2:
1057; KNL:       # %bb.0:
1058; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
1059; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1060; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
1061; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
1062; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
1063; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [5:0.50]
1064; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [5:0.50]
1065; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1066; KNL-NEXT:    retq # sched: [7:1.00]
1067;
1068; SKX-LABEL: v8f32_two_step2:
1069; SKX:       # %bb.0:
1070; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
1071; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1072; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
1073; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
1074; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
1075; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm2 sched: [4:0.50]
1076; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3 sched: [4:0.50]
1077; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
1078; SKX-NEXT:    retq # sched: [7:1.00]
1079  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
1080  ret <8 x float> %div
1081}
1082
1083define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
1084; SSE-LABEL: v8f32_no_step:
1085; SSE:       # %bb.0:
1086; SSE-NEXT:    rcpps %xmm0, %xmm0
1087; SSE-NEXT:    rcpps %xmm1, %xmm1
1088; SSE-NEXT:    retq
1089;
1090; AVX-RECIP-LABEL: v8f32_no_step:
1091; AVX-RECIP:       # %bb.0:
1092; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1093; AVX-RECIP-NEXT:    retq
1094;
1095; FMA-RECIP-LABEL: v8f32_no_step:
1096; FMA-RECIP:       # %bb.0:
1097; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1098; FMA-RECIP-NEXT:    retq
1099;
1100; BTVER2-LABEL: v8f32_no_step:
1101; BTVER2:       # %bb.0:
1102; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1103; BTVER2-NEXT:    retq # sched: [4:1.00]
1104;
1105; SANDY-LABEL: v8f32_no_step:
1106; SANDY:       # %bb.0:
1107; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1108; SANDY-NEXT:    retq # sched: [1:1.00]
1109;
1110; HASWELL-LABEL: v8f32_no_step:
1111; HASWELL:       # %bb.0:
1112; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1113; HASWELL-NEXT:    retq # sched: [7:1.00]
1114;
1115; HASWELL-NO-FMA-LABEL: v8f32_no_step:
1116; HASWELL-NO-FMA:       # %bb.0:
1117; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1118; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1119;
1120; KNL-LABEL: v8f32_no_step:
1121; KNL:       # %bb.0:
1122; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1123; KNL-NEXT:    retq # sched: [7:1.00]
1124;
1125; SKX-LABEL: v8f32_no_step:
1126; SKX:       # %bb.0:
1127; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
1128; SKX-NEXT:    retq # sched: [7:1.00]
1129  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1130  ret <8 x float> %div
1131}
1132
1133define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
1134; SSE-LABEL: v8f32_no_step2:
1135; SSE:       # %bb.0:
1136; SSE-NEXT:    rcpps %xmm1, %xmm1
1137; SSE-NEXT:    rcpps %xmm0, %xmm0
1138; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
1139; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
1140; SSE-NEXT:    retq
1141;
1142; AVX-RECIP-LABEL: v8f32_no_step2:
1143; AVX-RECIP:       # %bb.0:
1144; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1145; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1146; AVX-RECIP-NEXT:    retq
1147;
1148; FMA-RECIP-LABEL: v8f32_no_step2:
1149; FMA-RECIP:       # %bb.0:
1150; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1151; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1152; FMA-RECIP-NEXT:    retq
1153;
1154; BTVER2-LABEL: v8f32_no_step2:
1155; BTVER2:       # %bb.0:
1156; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1157; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1158; BTVER2-NEXT:    retq # sched: [4:1.00]
1159;
1160; SANDY-LABEL: v8f32_no_step2:
1161; SANDY:       # %bb.0:
1162; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1163; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1164; SANDY-NEXT:    retq # sched: [1:1.00]
1165;
1166; HASWELL-LABEL: v8f32_no_step2:
1167; HASWELL:       # %bb.0:
1168; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1169; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1170; HASWELL-NEXT:    retq # sched: [7:1.00]
1171;
1172; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
1173; HASWELL-NO-FMA:       # %bb.0:
1174; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1175; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1176; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1177;
1178; KNL-LABEL: v8f32_no_step2:
1179; KNL:       # %bb.0:
1180; KNL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1181; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1182; KNL-NEXT:    retq # sched: [7:1.00]
1183;
1184; SKX-LABEL: v8f32_no_step2:
1185; SKX:       # %bb.0:
1186; SKX-NEXT:    vrcpps %ymm0, %ymm0 # sched: [4:1.00]
1187; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
1188; SKX-NEXT:    retq # sched: [7:1.00]
1189  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
1190  ret <8 x float> %div
1191}
1192
1193define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
1194; SSE-LABEL: v16f32_one_step2:
1195; SSE:       # %bb.0:
1196; SSE-NEXT:    movaps %xmm3, %xmm4
1197; SSE-NEXT:    movaps %xmm2, %xmm5
1198; SSE-NEXT:    movaps %xmm0, %xmm6
1199; SSE-NEXT:    rcpps %xmm3, %xmm2
1200; SSE-NEXT:    mulps %xmm2, %xmm4
1201; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1202; SSE-NEXT:    movaps %xmm0, %xmm3
1203; SSE-NEXT:    subps %xmm4, %xmm3
1204; SSE-NEXT:    mulps %xmm2, %xmm3
1205; SSE-NEXT:    addps %xmm2, %xmm3
1206; SSE-NEXT:    rcpps %xmm5, %xmm4
1207; SSE-NEXT:    mulps %xmm4, %xmm5
1208; SSE-NEXT:    movaps %xmm0, %xmm2
1209; SSE-NEXT:    subps %xmm5, %xmm2
1210; SSE-NEXT:    mulps %xmm4, %xmm2
1211; SSE-NEXT:    addps %xmm4, %xmm2
1212; SSE-NEXT:    rcpps %xmm1, %xmm5
1213; SSE-NEXT:    mulps %xmm5, %xmm1
1214; SSE-NEXT:    movaps %xmm0, %xmm4
1215; SSE-NEXT:    subps %xmm1, %xmm4
1216; SSE-NEXT:    mulps %xmm5, %xmm4
1217; SSE-NEXT:    addps %xmm5, %xmm4
1218; SSE-NEXT:    rcpps %xmm6, %xmm1
1219; SSE-NEXT:    mulps %xmm1, %xmm6
1220; SSE-NEXT:    subps %xmm6, %xmm0
1221; SSE-NEXT:    mulps %xmm1, %xmm0
1222; SSE-NEXT:    addps %xmm1, %xmm0
1223; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
1224; SSE-NEXT:    mulps {{.*}}(%rip), %xmm4
1225; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
1226; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
1227; SSE-NEXT:    movaps %xmm4, %xmm1
1228; SSE-NEXT:    retq
1229;
1230; AVX-RECIP-LABEL: v16f32_one_step2:
1231; AVX-RECIP:       # %bb.0:
1232; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1233; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1234; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1235; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1236; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1237; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1238; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1239; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1240; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1241; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1242; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1243; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1244; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
1245; AVX-RECIP-NEXT:    retq
1246;
1247; FMA-RECIP-LABEL: v16f32_one_step2:
1248; FMA-RECIP:       # %bb.0:
1249; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1250; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1251; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
1252; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
1253; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1254; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1255; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
1256; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1257; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
1258; FMA-RECIP-NEXT:    retq
1259;
1260; BTVER2-LABEL: v16f32_one_step2:
1261; BTVER2:       # %bb.0:
1262; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
1263; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
1264; BTVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [2:2.00]
1265; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
1266; BTVER2-NEXT:    vmulps %ymm4, %ymm0, %ymm0 # sched: [2:2.00]
1267; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
1268; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
1269; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1270; BTVER2-NEXT:    vmulps %ymm0, %ymm4, %ymm0 # sched: [2:2.00]
1271; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
1272; BTVER2-NEXT:    vaddps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
1273; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1274; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
1275; BTVER2-NEXT:    retq # sched: [4:1.00]
1276;
1277; SANDY-LABEL: v16f32_one_step2:
1278; SANDY:       # %bb.0:
1279; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
1280; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
1281; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
1282; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1283; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
1284; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1285; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
1286; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
1287; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1288; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
1289; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1290; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1291; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
1292; SANDY-NEXT:    retq # sched: [1:1.00]
1293;
1294; HASWELL-LABEL: v16f32_one_step2:
1295; HASWELL:       # %bb.0:
1296; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1297; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1298; HASWELL-NEXT:    vrcpps %ymm0, %ymm4 # sched: [11:2.00]
1299; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
1300; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
1301; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
1302; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
1303; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1304; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1305; HASWELL-NEXT:    retq # sched: [7:1.00]
1306;
1307; HASWELL-NO-FMA-LABEL: v16f32_one_step2:
1308; HASWELL-NO-FMA:       # %bb.0:
1309; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1310; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
1311; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1312; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1313; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1314; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1315; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1316; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1317; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1318; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1319; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1320; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1321; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1322; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1323;
1324; KNL-LABEL: v16f32_one_step2:
1325; KNL:       # %bb.0:
1326; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1327; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1328; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1329; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1330; KNL-NEXT:    retq # sched: [7:1.00]
1331;
1332; SKX-LABEL: v16f32_one_step2:
1333; SKX:       # %bb.0:
1334; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1335; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1336; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
1337; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1338; SKX-NEXT:    retq # sched: [7:1.00]
1339  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1340  ret <16 x float> %div
1341}
1342
1343define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
1344; SSE-LABEL: v16f32_one_step_2_divs:
1345; SSE:       # %bb.0:
1346; SSE-NEXT:    rcpps %xmm0, %xmm6
1347; SSE-NEXT:    mulps %xmm6, %xmm0
1348; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1349; SSE-NEXT:    movaps %xmm4, %xmm5
1350; SSE-NEXT:    subps %xmm0, %xmm5
1351; SSE-NEXT:    mulps %xmm6, %xmm5
1352; SSE-NEXT:    addps %xmm6, %xmm5
1353; SSE-NEXT:    rcpps %xmm1, %xmm0
1354; SSE-NEXT:    mulps %xmm0, %xmm1
1355; SSE-NEXT:    movaps %xmm4, %xmm6
1356; SSE-NEXT:    subps %xmm1, %xmm6
1357; SSE-NEXT:    mulps %xmm0, %xmm6
1358; SSE-NEXT:    addps %xmm0, %xmm6
1359; SSE-NEXT:    rcpps %xmm2, %xmm0
1360; SSE-NEXT:    mulps %xmm0, %xmm2
1361; SSE-NEXT:    movaps %xmm4, %xmm7
1362; SSE-NEXT:    subps %xmm2, %xmm7
1363; SSE-NEXT:    mulps %xmm0, %xmm7
1364; SSE-NEXT:    addps %xmm0, %xmm7
1365; SSE-NEXT:    rcpps %xmm3, %xmm0
1366; SSE-NEXT:    mulps %xmm0, %xmm3
1367; SSE-NEXT:    subps %xmm3, %xmm4
1368; SSE-NEXT:    mulps %xmm0, %xmm4
1369; SSE-NEXT:    addps %xmm0, %xmm4
1370; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.300000e+01,1.400000e+01,1.500000e+01,1.600000e+01]
1371; SSE-NEXT:    mulps %xmm4, %xmm3
1372; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9.000000e+00,1.000000e+01,1.100000e+01,1.200000e+01]
1373; SSE-NEXT:    mulps %xmm7, %xmm2
1374; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
1375; SSE-NEXT:    mulps %xmm6, %xmm1
1376; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
1377; SSE-NEXT:    mulps %xmm5, %xmm0
1378; SSE-NEXT:    mulps %xmm5, %xmm0
1379; SSE-NEXT:    mulps %xmm6, %xmm1
1380; SSE-NEXT:    mulps %xmm7, %xmm2
1381; SSE-NEXT:    mulps %xmm4, %xmm3
1382; SSE-NEXT:    retq
1383;
1384; AVX-RECIP-LABEL: v16f32_one_step_2_divs:
1385; AVX-RECIP:       # %bb.0:
1386; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1387; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1388; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1389; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1390; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1391; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1392; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1393; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1394; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1395; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1396; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1397; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
1398; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
1399; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1400; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1401; AVX-RECIP-NEXT:    retq
1402;
1403; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
1404; FMA-RECIP:       # %bb.0:
1405; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1406; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1407; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1408; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
1409; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1410; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
1411; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
1412; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2
1413; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3
1414; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1415; FMA-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1416; FMA-RECIP-NEXT:    retq
1417;
1418; BTVER2-LABEL: v16f32_one_step_2_divs:
1419; BTVER2:       # %bb.0:
1420; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
1421; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
1422; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
1423; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
1424; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
1425; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
1426; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
1427; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
1428; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:2.00]
1429; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [7:2.00]
1430; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1431; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
1432; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [7:2.00]
1433; BTVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [2:2.00]
1434; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1435; BTVER2-NEXT:    retq # sched: [4:1.00]
1436;
1437; SANDY-LABEL: v16f32_one_step_2_divs:
1438; SANDY:       # %bb.0:
1439; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
1440; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
1441; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
1442; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1443; SANDY-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
1444; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
1445; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1446; SANDY-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:1.00]
1447; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1448; SANDY-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:1.00]
1449; SANDY-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1450; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:1.00]
1451; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:1.00]
1452; SANDY-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:1.00]
1453; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
1454; SANDY-NEXT:    retq # sched: [1:1.00]
1455;
1456; HASWELL-LABEL: v16f32_one_step_2_divs:
1457; HASWELL:       # %bb.0:
1458; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1459; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1460; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
1461; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
1462; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1463; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
1464; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
1465; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
1466; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
1467; HASWELL-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
1468; HASWELL-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1469; HASWELL-NEXT:    retq # sched: [7:1.00]
1470;
1471; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs:
1472; HASWELL-NO-FMA:       # %bb.0:
1473; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1474; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1475; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1476; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
1477; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
1478; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1479; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1480; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
1481; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
1482; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
1483; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1484; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [12:0.50]
1485; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [12:0.50]
1486; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:0.50]
1487; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1488; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1489;
1490; KNL-LABEL: v16f32_one_step_2_divs:
1491; KNL:       # %bb.0:
1492; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1493; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [12:0.50]
1494; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [5:0.50]
1495; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [12:0.50]
1496; KNL-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [5:0.50]
1497; KNL-NEXT:    retq # sched: [7:1.00]
1498;
1499; SKX-LABEL: v16f32_one_step_2_divs:
1500; SKX:       # %bb.0:
1501; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1502; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + mem sched: [11:0.50]
1503; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm1 sched: [4:0.50]
1504; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm1 # sched: [11:0.50]
1505; SKX-NEXT:    vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.50]
1506; SKX-NEXT:    retq # sched: [7:1.00]
1507  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1508  %div2 = fdiv fast <16 x float> %div, %x
1509  ret <16 x float> %div2
1510}
1511
1512define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
1513; SSE-LABEL: v16f32_two_step2:
1514; SSE:       # %bb.0:
1515; SSE-NEXT:    movaps %xmm3, %xmm6
1516; SSE-NEXT:    movaps %xmm2, %xmm5
1517; SSE-NEXT:    movaps %xmm0, %xmm4
1518; SSE-NEXT:    rcpps %xmm3, %xmm2
1519; SSE-NEXT:    mulps %xmm2, %xmm3
1520; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1521; SSE-NEXT:    movaps %xmm0, %xmm7
1522; SSE-NEXT:    subps %xmm3, %xmm7
1523; SSE-NEXT:    mulps %xmm2, %xmm7
1524; SSE-NEXT:    addps %xmm2, %xmm7
1525; SSE-NEXT:    mulps %xmm7, %xmm6
1526; SSE-NEXT:    movaps %xmm0, %xmm3
1527; SSE-NEXT:    subps %xmm6, %xmm3
1528; SSE-NEXT:    mulps %xmm7, %xmm3
1529; SSE-NEXT:    addps %xmm7, %xmm3
1530; SSE-NEXT:    rcpps %xmm5, %xmm2
1531; SSE-NEXT:    movaps %xmm5, %xmm6
1532; SSE-NEXT:    mulps %xmm2, %xmm6
1533; SSE-NEXT:    movaps %xmm0, %xmm7
1534; SSE-NEXT:    subps %xmm6, %xmm7
1535; SSE-NEXT:    mulps %xmm2, %xmm7
1536; SSE-NEXT:    addps %xmm2, %xmm7
1537; SSE-NEXT:    mulps %xmm7, %xmm5
1538; SSE-NEXT:    movaps %xmm0, %xmm2
1539; SSE-NEXT:    subps %xmm5, %xmm2
1540; SSE-NEXT:    mulps %xmm7, %xmm2
1541; SSE-NEXT:    addps %xmm7, %xmm2
1542; SSE-NEXT:    rcpps %xmm1, %xmm5
1543; SSE-NEXT:    movaps %xmm1, %xmm6
1544; SSE-NEXT:    mulps %xmm5, %xmm6
1545; SSE-NEXT:    movaps %xmm0, %xmm7
1546; SSE-NEXT:    subps %xmm6, %xmm7
1547; SSE-NEXT:    mulps %xmm5, %xmm7
1548; SSE-NEXT:    addps %xmm5, %xmm7
1549; SSE-NEXT:    mulps %xmm7, %xmm1
1550; SSE-NEXT:    movaps %xmm0, %xmm5
1551; SSE-NEXT:    subps %xmm1, %xmm5
1552; SSE-NEXT:    mulps %xmm7, %xmm5
1553; SSE-NEXT:    addps %xmm7, %xmm5
1554; SSE-NEXT:    rcpps %xmm4, %xmm1
1555; SSE-NEXT:    movaps %xmm4, %xmm6
1556; SSE-NEXT:    mulps %xmm1, %xmm6
1557; SSE-NEXT:    movaps %xmm0, %xmm7
1558; SSE-NEXT:    subps %xmm6, %xmm7
1559; SSE-NEXT:    mulps %xmm1, %xmm7
1560; SSE-NEXT:    addps %xmm1, %xmm7
1561; SSE-NEXT:    mulps %xmm7, %xmm4
1562; SSE-NEXT:    subps %xmm4, %xmm0
1563; SSE-NEXT:    mulps %xmm7, %xmm0
1564; SSE-NEXT:    addps %xmm7, %xmm0
1565; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
1566; SSE-NEXT:    mulps {{.*}}(%rip), %xmm5
1567; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
1568; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
1569; SSE-NEXT:    movaps %xmm5, %xmm1
1570; SSE-NEXT:    retq
1571;
1572; AVX-RECIP-LABEL: v16f32_two_step2:
1573; AVX-RECIP:       # %bb.0:
1574; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1575; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
1576; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1577; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
1578; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
1579; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
1580; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1581; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm4, %ymm1
1582; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1583; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1584; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1585; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
1586; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
1587; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
1588; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
1589; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1590; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm4, %ymm0
1591; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1592; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1593; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1594; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
1595; AVX-RECIP-NEXT:    retq
1596;
1597; FMA-RECIP-LABEL: v16f32_two_step2:
1598; FMA-RECIP:       # %bb.0:
1599; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1600; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
1601; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
1602; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
1603; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1604; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3
1605; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
1606; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1607; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
1608; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
1609; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
1610; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3
1611; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4
1612; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1613; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
1614; FMA-RECIP-NEXT:    retq
1615;
1616; BTVER2-LABEL: v16f32_two_step2:
1617; BTVER2:       # %bb.0:
1618; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
1619; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
1620; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
1621; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
1622; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
1623; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
1624; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
1625; BTVER2-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:2.00]
1626; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [2:2.00]
1627; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:2.00]
1628; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
1629; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
1630; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
1631; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
1632; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [2:2.00]
1633; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:2.00]
1634; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
1635; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:2.00]
1636; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [2:2.00]
1637; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
1638; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1639; BTVER2-NEXT:    retq # sched: [4:1.00]
1640;
1641; SANDY-LABEL: v16f32_two_step2:
1642; SANDY:       # %bb.0:
1643; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
1644; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
1645; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
1646; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1647; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
1648; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1649; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
1650; SANDY-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1651; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
1652; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1653; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
1654; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
1655; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1656; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
1657; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1658; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
1659; SANDY-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
1660; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
1661; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1662; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1663; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
1664; SANDY-NEXT:    retq # sched: [1:1.00]
1665;
1666; HASWELL-LABEL: v16f32_two_step2:
1667; HASWELL:       # %bb.0:
1668; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1669; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1670; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
1671; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
1672; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
1673; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm3 sched: [5:0.50]
1674; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4 sched: [5:0.50]
1675; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1676; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
1677; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
1678; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
1679; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm3 sched: [5:0.50]
1680; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm4 sched: [5:0.50]
1681; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1682; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1683; HASWELL-NEXT:    retq # sched: [7:1.00]
1684;
1685; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
1686; HASWELL-NO-FMA:       # %bb.0:
1687; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
1688; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50]
1689; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
1690; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1691; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
1692; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1693; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
1694; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm4, %ymm1 # sched: [3:1.00]
1695; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
1696; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
1697; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
1698; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
1699; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
1700; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
1701; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
1702; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
1703; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm4, %ymm0 # sched: [3:1.00]
1704; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
1705; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
1706; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1707; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1708; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1709;
1710; KNL-LABEL: v16f32_two_step2:
1711; KNL:       # %bb.0:
1712; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
1713; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
1714; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
1715; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
1716; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
1717; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [5:0.50]
1718; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [5:0.50]
1719; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1720; KNL-NEXT:    retq # sched: [7:1.00]
1721;
1722; SKX-LABEL: v16f32_two_step2:
1723; SKX:       # %bb.0:
1724; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
1725; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
1726; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
1727; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
1728; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
1729; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm2 sched: [4:0.50]
1730; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm3) + zmm3 sched: [4:0.50]
1731; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1732; SKX-NEXT:    retq # sched: [7:1.00]
1733  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1734  ret <16 x float> %div
1735}
1736
1737define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
1738; SSE-LABEL: v16f32_no_step:
1739; SSE:       # %bb.0:
1740; SSE-NEXT:    rcpps %xmm0, %xmm0
1741; SSE-NEXT:    rcpps %xmm1, %xmm1
1742; SSE-NEXT:    rcpps %xmm2, %xmm2
1743; SSE-NEXT:    rcpps %xmm3, %xmm3
1744; SSE-NEXT:    retq
1745;
1746; AVX-RECIP-LABEL: v16f32_no_step:
1747; AVX-RECIP:       # %bb.0:
1748; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1749; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
1750; AVX-RECIP-NEXT:    retq
1751;
1752; FMA-RECIP-LABEL: v16f32_no_step:
1753; FMA-RECIP:       # %bb.0:
1754; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1755; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
1756; FMA-RECIP-NEXT:    retq
1757;
1758; BTVER2-LABEL: v16f32_no_step:
1759; BTVER2:       # %bb.0:
1760; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1761; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
1762; BTVER2-NEXT:    retq # sched: [4:1.00]
1763;
1764; SANDY-LABEL: v16f32_no_step:
1765; SANDY:       # %bb.0:
1766; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1767; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
1768; SANDY-NEXT:    retq # sched: [1:1.00]
1769;
1770; HASWELL-LABEL: v16f32_no_step:
1771; HASWELL:       # %bb.0:
1772; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1773; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
1774; HASWELL-NEXT:    retq # sched: [7:1.00]
1775;
1776; HASWELL-NO-FMA-LABEL: v16f32_no_step:
1777; HASWELL-NO-FMA:       # %bb.0:
1778; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1779; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
1780; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1781;
1782; KNL-LABEL: v16f32_no_step:
1783; KNL:       # %bb.0:
1784; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
1785; KNL-NEXT:    retq # sched: [7:1.00]
1786;
1787; SKX-LABEL: v16f32_no_step:
1788; SKX:       # %bb.0:
1789; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
1790; SKX-NEXT:    retq # sched: [7:1.00]
1791  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1792  ret <16 x float> %div
1793}
1794
1795define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
1796; SSE-LABEL: v16f32_no_step2:
1797; SSE:       # %bb.0:
1798; SSE-NEXT:    rcpps %xmm3, %xmm3
1799; SSE-NEXT:    rcpps %xmm2, %xmm2
1800; SSE-NEXT:    rcpps %xmm1, %xmm1
1801; SSE-NEXT:    rcpps %xmm0, %xmm0
1802; SSE-NEXT:    mulps {{.*}}(%rip), %xmm0
1803; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
1804; SSE-NEXT:    mulps {{.*}}(%rip), %xmm2
1805; SSE-NEXT:    mulps {{.*}}(%rip), %xmm3
1806; SSE-NEXT:    retq
1807;
1808; AVX-RECIP-LABEL: v16f32_no_step2:
1809; AVX-RECIP:       # %bb.0:
1810; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
1811; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1812; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1813; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
1814; AVX-RECIP-NEXT:    retq
1815;
1816; FMA-RECIP-LABEL: v16f32_no_step2:
1817; FMA-RECIP:       # %bb.0:
1818; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
1819; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
1820; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
1821; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
1822; FMA-RECIP-NEXT:    retq
1823;
1824; BTVER2-LABEL: v16f32_no_step2:
1825; BTVER2:       # %bb.0:
1826; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
1827; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
1828; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
1829; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [7:2.00]
1830; BTVER2-NEXT:    retq # sched: [4:1.00]
1831;
1832; SANDY-LABEL: v16f32_no_step2:
1833; SANDY:       # %bb.0:
1834; SANDY-NEXT:    vrcpps %ymm1, %ymm1 # sched: [7:2.00]
1835; SANDY-NEXT:    vrcpps %ymm0, %ymm0 # sched: [7:2.00]
1836; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
1837; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:1.00]
1838; SANDY-NEXT:    retq # sched: [1:1.00]
1839;
1840; HASWELL-LABEL: v16f32_no_step2:
1841; HASWELL:       # %bb.0:
1842; HASWELL-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
1843; HASWELL-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1844; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1845; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1846; HASWELL-NEXT:    retq # sched: [7:1.00]
1847;
1848; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
1849; HASWELL-NO-FMA:       # %bb.0:
1850; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1 # sched: [11:2.00]
1851; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0 # sched: [11:2.00]
1852; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
1853; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [12:0.50]
1854; HASWELL-NO-FMA-NEXT:    retq # sched: [7:1.00]
1855;
1856; KNL-LABEL: v16f32_no_step2:
1857; KNL:       # %bb.0:
1858; KNL-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [11:2.00]
1859; KNL-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [12:0.50]
1860; KNL-NEXT:    retq # sched: [7:1.00]
1861;
1862; SKX-LABEL: v16f32_no_step2:
1863; SKX:       # %bb.0:
1864; SKX-NEXT:    vrcp14ps %zmm0, %zmm0 # sched: [4:2.00]
1865; SKX-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
1866; SKX-NEXT:    retq # sched: [7:1.00]
1867  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1868  ret <16 x float> %div
1869}
1870
1871attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
1872attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
1873attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
1874attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }
1875
1876