Lines Matching refs:RECIP
3 …< %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-RECIP
4 …< %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefixes=AVX,FMA-RECIP
50 ; AVX-RECIP-LABEL: f32_one_step:
51 ; AVX-RECIP: # %bb.0:
52 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
53 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
54 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
55 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
56 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
57 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
58 ; AVX-RECIP-NEXT: retq
60 ; FMA-RECIP-LABEL: f32_one_step:
61 ; FMA-RECIP: # %bb.0:
62 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
63 ; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
64 ; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
65 ; FMA-RECIP-NEXT: retq
133 ; AVX-RECIP-LABEL: f32_one_step_variables:
134 ; AVX-RECIP: # %bb.0:
135 ; AVX-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
136 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
137 ; AVX-RECIP-NEXT: vmulss %xmm3, %xmm1, %xmm1
138 ; AVX-RECIP-NEXT: vsubss %xmm1, %xmm0, %xmm0
139 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm2, %xmm0
140 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm3, %xmm0
141 ; AVX-RECIP-NEXT: retq
143 ; FMA-RECIP-LABEL: f32_one_step_variables:
144 ; FMA-RECIP: # %bb.0:
145 ; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
146 ; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
147 ; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
148 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
149 ; FMA-RECIP-NEXT: retq
226 ; AVX-RECIP-LABEL: f32_two_step:
227 ; AVX-RECIP: # %bb.0:
228 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
229 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
230 ; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
231 ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
232 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
233 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
234 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
235 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm3, %xmm0
236 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
237 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
238 ; AVX-RECIP-NEXT: retq
240 ; FMA-RECIP-LABEL: f32_two_step:
241 ; FMA-RECIP: # %bb.0:
242 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
243 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
244 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
245 ; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
246 ; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
247 ; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
248 ; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
249 ; FMA-RECIP-NEXT: retq
336 ; AVX-RECIP-LABEL: v4f32_no_estimate:
337 ; AVX-RECIP: # %bb.0:
338 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
339 ; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
340 ; AVX-RECIP-NEXT: retq
342 ; FMA-RECIP-LABEL: v4f32_no_estimate:
343 ; FMA-RECIP: # %bb.0:
344 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
345 ; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
346 ; FMA-RECIP-NEXT: retq
399 ; AVX-RECIP-LABEL: v4f32_one_step:
400 ; AVX-RECIP: # %bb.0:
401 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
402 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
403 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
404 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0
405 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
406 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
407 ; AVX-RECIP-NEXT: retq
409 ; FMA-RECIP-LABEL: v4f32_one_step:
410 ; FMA-RECIP: # %bb.0:
411 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
412 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
413 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
414 ; FMA-RECIP-NEXT: retq
493 ; AVX-RECIP-LABEL: v4f32_one_step_variables:
494 ; AVX-RECIP: # %bb.0:
495 ; AVX-RECIP-NEXT: vrcpps %xmm1, %xmm2
496 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
497 ; AVX-RECIP-NEXT: vmulps %xmm3, %xmm1, %xmm1
498 ; AVX-RECIP-NEXT: vsubps %xmm1, %xmm0, %xmm0
499 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm2, %xmm0
500 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm3, %xmm0
501 ; AVX-RECIP-NEXT: retq
503 ; FMA-RECIP-LABEL: v4f32_one_step_variables:
504 ; FMA-RECIP: # %bb.0:
505 ; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2
506 ; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
507 ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
508 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
509 ; FMA-RECIP-NEXT: retq
586 ; AVX-RECIP-LABEL: v4f32_two_step:
587 ; AVX-RECIP: # %bb.0:
588 ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
589 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
590 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
591 ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2
592 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2
593 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1
594 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
595 ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm3, %xmm0
596 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0
597 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0
598 ; AVX-RECIP-NEXT: retq
600 ; FMA-RECIP-LABEL: v4f32_two_step:
601 ; FMA-RECIP: # %bb.0:
602 ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
603 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
604 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
605 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
606 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
607 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
608 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm3) + xmm3
609 ; FMA-RECIP-NEXT: retq
699 ; AVX-RECIP-LABEL: v8f32_no_estimate:
700 ; AVX-RECIP: # %bb.0:
701 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
702 ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
703 ; AVX-RECIP-NEXT: retq
705 ; FMA-RECIP-LABEL: v8f32_no_estimate:
706 ; FMA-RECIP: # %bb.0:
707 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
708 ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
709 ; FMA-RECIP-NEXT: retq
769 ; AVX-RECIP-LABEL: v8f32_one_step:
770 ; AVX-RECIP: # %bb.0:
771 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
772 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
773 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
774 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0
775 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
776 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
777 ; AVX-RECIP-NEXT: retq
779 ; FMA-RECIP-LABEL: v8f32_one_step:
780 ; FMA-RECIP: # %bb.0:
781 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
782 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
783 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
784 ; FMA-RECIP-NEXT: retq
882 ; AVX-RECIP-LABEL: v8f32_two_step:
883 ; AVX-RECIP: # %bb.0:
884 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
885 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
886 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
887 ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2
888 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2
889 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1
890 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
891 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
892 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0
893 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0
894 ; AVX-RECIP-NEXT: retq
896 ; FMA-RECIP-LABEL: v8f32_two_step:
897 ; FMA-RECIP: # %bb.0:
898 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
899 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
900 ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
901 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2
902 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1
903 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
904 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm3) + ymm3
905 ; FMA-RECIP-NEXT: retq
1001 ; AVX-RECIP-LABEL: v16f32_no_estimate:
1002 ; AVX-RECIP: # %bb.0:
1003 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
1004 ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
1005 ; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
1006 ; AVX-RECIP-NEXT: retq
1008 ; FMA-RECIP-LABEL: v16f32_no_estimate:
1009 ; FMA-RECIP: # %bb.0:
1010 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
1011 ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0
1012 ; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1
1013 ; FMA-RECIP-NEXT: retq
1092 ; AVX-RECIP-LABEL: v16f32_one_step:
1093 ; AVX-RECIP: # %bb.0:
1094 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1095 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1096 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
1097 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0
1098 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1099 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1100 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1101 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1102 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm3, %ymm1
1103 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1104 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1105 ; AVX-RECIP-NEXT: retq
1107 ; FMA-RECIP-LABEL: v16f32_one_step:
1108 ; FMA-RECIP: # %bb.0:
1109 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
1110 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
1111 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
1112 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
1113 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
1114 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3
1115 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
1116 ; FMA-RECIP-NEXT: retq
1252 ; AVX-RECIP-LABEL: v16f32_two_step:
1253 ; AVX-RECIP: # %bb.0:
1254 ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2
1255 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3
1256 ; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
1257 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
1258 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
1259 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
1260 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0
1261 ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm4, %ymm0
1262 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0
1263 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0
1264 ; AVX-RECIP-NEXT: vrcpps %ymm1, %ymm2
1265 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm3
1266 ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3
1267 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3
1268 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2
1269 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm1
1270 ; AVX-RECIP-NEXT: vsubps %ymm1, %ymm4, %ymm1
1271 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm1
1272 ; AVX-RECIP-NEXT: vaddps %ymm1, %ymm2, %ymm1
1273 ; AVX-RECIP-NEXT: retq
1275 ; FMA-RECIP-LABEL: v16f32_two_step:
1276 ; FMA-RECIP: # %bb.0:
1277 ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2
1278 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E…
1279 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
1280 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
1281 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
1282 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
1283 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm4) + ymm4
1284 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2
1285 ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4
1286 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm1 * ymm4) - ymm3
1287 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
1288 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
1289 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm4) + ymm4
1290 ; FMA-RECIP-NEXT: retq