1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef BERBERIS_INTRINSICS_INTRINSICS_FLOAT_X86_H_
18 #define BERBERIS_INTRINSICS_INTRINSICS_FLOAT_X86_H_
19
20 #include <emmintrin.h>
21 #include <immintrin.h>
22 #include <math.h>
23 #include <pmmintrin.h> // _MM_DENORMALS_ZERO_ON
24 #include <xmmintrin.h> // _MM_FLUSH_ZERO_ON
25
26 #include "berberis/base/bit_util.h"
27 #include "berberis/base/logging.h"
28 #include "berberis/intrinsics/guest_fpstate.h" // FE_HOSTROUND
29
30 namespace berberis {
31
32 namespace intrinsics {
33
template <bool precise_nan_operations_handling>
class ScopedStandardFPSCRValue;

// RAII guard that, while alive, forces the host MXCSR into the "standard" state used for
// guest FP emulation and restores the previous state on destruction.  Only the <true>
// specialization does any work; the <false> specialization below is an empty no-op type.
template <>
class ScopedStandardFPSCRValue<true> {
 public:
  ScopedStandardFPSCRValue() : saved_mxcsr_(_mm_getcsr()) {
    // Keep exceptions disabled (_MM_MASK_MASK masks all SSE exceptions), set FTZ and DAZ bits.
    _mm_setcsr(_MM_MASK_MASK | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
  }
  ~ScopedStandardFPSCRValue() {
    // Keep exceptions, pick everything else from saved mask.  Exception status flags latched
    // while the guard was active are preserved (ORed into the restored value).
    _mm_setcsr((_mm_getcsr() & _MM_EXCEPT_MASK) | saved_mxcsr_);
  }

 private:
  // MXCSR value captured at construction, restored (merged with new status flags) at destruction.
  uint32_t saved_mxcsr_;
};
54
// No-op specialization: when precise NaN operations handling is not requested there is
// nothing to save or restore.
template <>
class [[maybe_unused]] ScopedStandardFPSCRValue<false> {};
57
// Generates an arithmetic operator and its compound-assignment form for both Float32 and
// Float64.  Each operator is implemented with the corresponding scalar SSE instruction
// (e.g. addss/addsd) via inline asm so values stay in XMM registers and NaN bit patterns
// never travel through the x87 stack (see the ABI comment further below).
#define MAKE_BINARY_OPERATOR(guest_name, operator_name, assignment_name)                 \
                                                                                         \
  inline Float32 operator operator_name(const Float32& v1, const Float32& v2) {          \
    Float32 result;                                                                      \
    asm(#guest_name "ss %2,%0" : "=x"(result.value_) : "0"(v1.value_), "x"(v2.value_));  \
    return result;                                                                       \
  }                                                                                      \
                                                                                         \
  inline Float32& operator assignment_name(Float32& v1, const Float32& v2) {             \
    asm(#guest_name "ss %2,%0" : "=x"(v1.value_) : "0"(v1.value_), "x"(v2.value_));      \
    return v1;                                                                           \
  }                                                                                      \
                                                                                         \
  inline Float64 operator operator_name(const Float64& v1, const Float64& v2) {          \
    Float64 result;                                                                      \
    asm(#guest_name "sd %2,%0" : "=x"(result.value_) : "0"(v1.value_), "x"(v2.value_));  \
    return result;                                                                       \
  }                                                                                      \
                                                                                         \
  inline Float64& operator assignment_name(Float64& v1, const Float64& v2) {             \
    asm(#guest_name "sd %2,%0" : "=x"(v1.value_) : "0"(v1.value_), "x"(v2.value_));      \
    return v1;                                                                           \
  }

MAKE_BINARY_OPERATOR(add, +, +=)
MAKE_BINARY_OPERATOR(sub, -, -=)
MAKE_BINARY_OPERATOR(mul, *, *=)
MAKE_BINARY_OPERATOR(div, /, /=)

#undef MAKE_BINARY_OPERATOR
88
// v1 < v2.  AT&T `ucomiss %1,%2` compares v2 (destination) against v1 (source);
// `seta` tests CF=0 && ZF=0, i.e. "v2 strictly above v1".  An unordered compare
// (either operand NaN) sets ZF=PF=CF=1, so comparisons involving NaN yield false,
// matching IEEE-754 semantics.
inline bool operator<(const Float32& v1, const Float32& v2) {
  bool result;
  asm("ucomiss %1,%2\n seta %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}

// Double-precision variant of the above; identical flag logic with ucomisd.
inline bool operator<(const Float64& v1, const Float64& v2) {
  bool result;
  asm("ucomisd %1,%2\n seta %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}
100
// v1 > v2.  Operand order is swapped relative to operator<: here v1 is the destination,
// so `seta` tests "v1 strictly above v2".  Unordered (NaN) compares set CF=1 and thus
// return false, matching IEEE-754 semantics.
inline bool operator>(const Float32& v1, const Float32& v2) {
  bool result;
  asm("ucomiss %2,%1\n seta %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}

// Double-precision variant of the above; identical flag logic with ucomisd.
inline bool operator>(const Float64& v1, const Float64& v2) {
  bool result;
  asm("ucomisd %2,%1\n seta %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}
112
// v1 <= v2.  `ucomiss %1,%2` compares v2 (destination) against v1; `setnb` tests CF=0,
// i.e. "v2 not below v1".  Unordered (NaN) compares set CF=1, so NaN yields false,
// matching IEEE-754 semantics.
inline bool operator<=(const Float32& v1, const Float32& v2) {
  bool result;
  asm("ucomiss %1,%2\n setnb %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}

// Double-precision variant of the above; identical flag logic with ucomisd.
inline bool operator<=(const Float64& v1, const Float64& v2) {
  bool result;
  asm("ucomisd %1,%2\n setnb %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}
124
// v1 >= v2.  Operand order is swapped relative to operator<=: v1 is the destination, so
// `setnb` (CF=0) tests "v1 not below v2".  Unordered (NaN) compares set CF=1 and thus
// return false, matching IEEE-754 semantics.
inline bool operator>=(const Float32& v1, const Float32& v2) {
  bool result;
  asm("ucomiss %2,%1\n setnb %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}

// Double-precision variant of the above; identical flag logic with ucomisd.
inline bool operator>=(const Float64& v1, const Float64& v2) {
  bool result;
  asm("ucomisd %2,%1\n setnb %0" : "=q"(result) : "x"(v1.value_), "x"(v2.value_) : "cc");
  return result;
}
136
// v1 == v2.  cmpeqss writes an all-ones mask into the low lane when the operands compare
// equal (ordered), all-zeros otherwise — including when either operand is NaN, so
// NaN == NaN is false per IEEE-754.  Bit 0 of the mask's bit pattern is the answer.
inline bool operator==(const Float32& v1, const Float32& v2) {
  float result;
  asm("cmpeqss %2,%0" : "=x"(result) : "0"(v1.value_), "x"(v2.value_));
  return bit_cast<uint32_t, float>(result) & 0x1;
}

// Double-precision variant of the above using cmpeqsd and a 64-bit mask.
inline bool operator==(const Float64& v1, const Float64& v2) {
  double result;
  asm("cmpeqsd %2,%0" : "=x"(result) : "0"(v1.value_), "x"(v2.value_));
  return bit_cast<uint64_t, double>(result) & 0x1;
}
148
// v1 != v2.  cmpneqss produces an all-ones mask for "not equal OR unordered", so a
// comparison involving NaN is true — the IEEE-754 behavior for !=.
inline bool operator!=(const Float32& v1, const Float32& v2) {
  float result;
  asm("cmpneqss %2,%0" : "=x"(result) : "0"(v1.value_), "x"(v2.value_));
  return bit_cast<uint32_t, float>(result) & 0x1;
}

// Double-precision variant of the above using cmpneqsd and a 64-bit mask.
inline bool operator!=(const Float64& v1, const Float64& v2) {
  double result;
  asm("cmpneqsd %2,%0" : "=x"(result) : "0"(v1.value_), "x"(v2.value_));
  return bit_cast<uint64_t, double>(result) & 0x1;
}
160
161 // It's NOT safe to use ANY functions which return float or double. That's because IA32 ABI uses
162 // x87 stack to pass arguments (and does that even with -mfpmath=sse) and NaN float and
163 // double values would be corrupted if pushed on it.
164 //
165 // It's safe to use builtins here if that file is compiled with -mfpmath=sse (clang does not have
166 // such flag but uses SSE whenever possible, GCC needs both -msse2 and -mfpmath=sse) since builtins
DON'T use the official calling convention but are instead embedded in the function - even if all
168 // optimizations are disabled.
169
// Returns v1 with its sign bit replaced by the sign bit of v2 (builtin copysign; the
// builtin is required here — see the ABI comment above about float-returning functions).
inline Float32 CopySignBit(const Float32& v1, const Float32& v2) {
  return Float32(__builtin_copysignf(v1.value_, v2.value_));
}

// Double-precision variant of the above.
inline Float64 CopySignBit(const Float64& v1, const Float64& v2) {
  return Float64(__builtin_copysign(v1.value_, v2.value_));
}
177
// Absolute value (clears the sign bit; builtin used for the ABI reasons explained above).
inline Float32 Absolute(const Float32& v) {
  return Float32(__builtin_fabsf(v.value_));
}

// Double-precision variant of the above.
inline Float64 Absolute(const Float64& v) {
  return Float64(__builtin_fabs(v.value_));
}
185
// Arithmetic negation implemented by XORing the sign bit in an XMM register.
inline Float32 Negative(const Float32& v) {
  // TODO(b/120563432): Simple -v.value_ doesn't work after a clang update.
  Float32 result;
  // Only bit 31 of the mask is set, so pxor flips just the sign bit of the low
  // single-precision lane.  NOTE(review): a uint32_t would express the 32-bit mask more
  // directly; presumably uint64_t is used so the "x" (XMM) constraint loads cleanly on
  // all targets — confirm before changing.
  uint64_t sign_bit = 0x80000000U;
  asm("pxor %2, %0" : "=x"(result.value_) : "0"(v.value_), "x"(sign_bit));
  return result;
}

// Double-precision variant: bit 63 is the sign bit of a double.
inline Float64 Negative(const Float64& v) {
  // TODO(b/120563432): Simple -v.value_ doesn't work after a clang update.
  Float64 result;
  uint64_t sign_bit = 0x8000000000000000ULL;
  asm("pxor %2, %0" : "=x"(result.value_) : "0"(v.value_), "x"(sign_bit));
  return result;
}
201
// Classifies v into one of the FPInfo categories.  __builtin_fpclassify takes the five
// values to return for NaN/infinite/normal/subnormal/zero (in that order) followed by
// the value being classified, so we pass the FPInfo enumerators directly.
inline FPInfo FPClassify(const Float32& v) {
  return static_cast<FPInfo>(__builtin_fpclassify(static_cast<int>(FPInfo::kNaN),
                                                  static_cast<int>(FPInfo::kInfinite),
                                                  static_cast<int>(FPInfo::kNormal),
                                                  static_cast<int>(FPInfo::kSubnormal),
                                                  static_cast<int>(FPInfo::kZero),
                                                  v.value_));
}

// Double-precision variant of the above.
inline FPInfo FPClassify(const Float64& v) {
  return static_cast<FPInfo>(__builtin_fpclassify(static_cast<int>(FPInfo::kNaN),
                                                  static_cast<int>(FPInfo::kInfinite),
                                                  static_cast<int>(FPInfo::kNormal),
                                                  static_cast<int>(FPInfo::kSubnormal),
                                                  static_cast<int>(FPInfo::kZero),
                                                  v.value_));
}
219
// Rounds `value` to an integral value using the given rounding mode.  roundss imm8
// encodings: $0 nearest-even, $1 toward -inf, $2 toward +inf, $3 toward zero, $4 use the
// current MXCSR rounding mode (FE_HOSTROUND).
inline Float32 FPRound(const Float32& value, uint32_t round_control) {
  Float32 result;
  switch (round_control) {
    case FE_HOSTROUND:
      asm("roundss $4,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_TONEAREST:
      asm("roundss $0,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_DOWNWARD:
      asm("roundss $1,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_UPWARD:
      asm("roundss $2,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_TOWARDZERO:
      asm("roundss $3,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_TIESAWAY:
      // x86 does not support this rounding mode directly, so ties (fraction exactly 1/2)
      // are handled manually, mirroring the Float64 implementation below.
      if (value == FPRound(value, FE_DOWNWARD)) {
        // Value is already an integer and can be returned as-is.  Checking this first
        // avoids dealing with values too large to have a fractional part, where adding
        // Float32(0.5) below would be meaningless (resolves the concern in b/146437763).
        return value;
      } else if (value == FPRound(value, FE_DOWNWARD) + Float32(0.5)) {
        // Fraction part is exactly 1/2: tie away from zero.
        result = value > Float32(0.0) ? FPRound(value, FE_UPWARD) : FPRound(value, FE_DOWNWARD);
      } else {
        // Any other case can be handled by to-nearest rounding.
        result = FPRound(value, FE_TONEAREST);
      }
      break;
    default:
      LOG_ALWAYS_FATAL("Internal error: unknown round_control in FPRound!");
      result.value_ = 0.f;
  }
  return result;
}
252
// Rounds `value` to an integral value using the given rounding mode.  roundsd imm8
// encodings: $0 nearest-even, $1 toward -inf, $2 toward +inf, $3 toward zero, $4 use the
// current MXCSR rounding mode (FE_HOSTROUND).
inline Float64 FPRound(const Float64& value, uint32_t round_control) {
  Float64 result;
  switch (round_control) {
    case FE_HOSTROUND:
      asm("roundsd $4,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_TONEAREST:
      asm("roundsd $0,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_DOWNWARD:
      asm("roundsd $1,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_UPWARD:
      asm("roundsd $2,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_TOWARDZERO:
      asm("roundsd $3,%1,%0" : "=x"(result.value_) : "x"(value.value_));
      break;
    case FE_TIESAWAY:
      // Since x86 does not support this rounding mode exactly, we must manually handle the
      // tie-aways (from (-)x.5)
      if (value == FPRound(value, FE_DOWNWARD)) {
        // Value is already an integer and can be returned as-is. Checking this first avoids dealing
        // with numbers too large to be able to have a fractional part.
        return value;
      } else if (value == FPRound(value, FE_DOWNWARD) + Float64(0.5)) {
        // Fraction part is exactly 1/2, in which case we need to tie-away
        result = value > Float64(0.0) ? FPRound(value, FE_UPWARD) : FPRound(value, FE_DOWNWARD);
      } else {
        // Any other case can be handled by to-nearest rounding.
        result = FPRound(value, FE_TONEAREST);
      }
      break;
    default:
      LOG_ALWAYS_FATAL("Internal error: unknown round_control in FPRound!");
      result.value_ = 0.;
  }
  return result;
}
292
// Returns nonzero iff v is a NaN (quiet or signaling).
inline int IsNan(const Float32& v) {
  return __builtin_isnan(v.value_);
}

// Double-precision variant of the above.
inline int IsNan(const Float64& v) {
  return __builtin_isnan(v.value_);
}
300
// Returns nonzero iff the sign bit of v is set (true for -0.0 and negative NaNs too).
inline int SignBit(const Float32& v) {
  return __builtin_signbitf(v.value_);
}

// Double-precision variant of the above.
inline int SignBit(const Float64& v) {
  return __builtin_signbit(v.value_);
}
308
// Square root via the builtin (compiles to sqrtss/sqrtsd with SSE math; builtin required
// for the ABI reasons explained above).
inline Float32 Sqrt(const Float32& v) {
  return Float32(__builtin_sqrtf(v.value_));
}

// Double-precision variant of the above.
inline Float64 Sqrt(const Float64& v) {
  return Float64(__builtin_sqrt(v.value_));
}
316
// x*y + z
// Fused multiply-add: v1*v2 + v3 with a single rounding.
// NOTE(review): fmaf/fma return float/double; per the comment above about the IA32 x87
// ABI this is only safe if the compiler expands them inline as builtins rather than
// emitting a libm call — confirm that holds for the supported toolchains.
inline Float32 MulAdd(const Float32& v1, const Float32& v2, const Float32& v3) {
  return Float32(fmaf(v1.value_, v2.value_, v3.value_));
}

// Double-precision variant of the above.
inline Float64 MulAdd(const Float64& v1, const Float64& v2, const Float64& v3) {
  return Float64(fma(v1.value_, v2.value_, v3.value_));
}
325
// Returns true iff none of the arguments is a NaN (vacuously true for zero arguments).
//
// Implemented with a C++17 fold expression instead of iterating over an initializer_list:
// the fold short-circuits on the first NaN found and, unlike `{srcs...}`, does not force
// all arguments to a single common type, so Float32 and Float64 values may be mixed.
template <typename... Srcs>
bool AllAreNotNan(Srcs... srcs) {
  return (... && !IsNan(srcs));
}
335
336 } // namespace intrinsics
337
338 } // namespace berberis
339
#endif  // BERBERIS_INTRINSICS_INTRINSICS_FLOAT_X86_H_
341