1 #include "rs_core.rsh"
2 #include "rs_f16_util.h"
3
// Prototypes only: int-vector to float-vector conversions.
// No definitions for these appear in this section of the file.
extern float2 __attribute__((overloadable)) convert_float2(int2 c);
extern float3 __attribute__((overloadable)) convert_float3(int3 c);
extern float4 __attribute__((overloadable)) convert_float4(int4 c);

// Prototypes only: float-vector to int-vector conversions.
extern int2 __attribute__((overloadable)) convert_int2(float2 c);
extern int3 __attribute__((overloadable)) convert_int3(float3 c);
extern int4 __attribute__((overloadable)) convert_int4(float4 c);


// Prototypes for fmin taking a scalar or vector first argument and a scalar
// second argument. The scalar form is used by fract() further down.
extern float __attribute__((overloadable)) fmin(float v, float v2);
extern float2 __attribute__((overloadable)) fmin(float2 v, float v2);
extern float3 __attribute__((overloadable)) fmin(float3 v, float v2);
extern float4 __attribute__((overloadable)) fmin(float4 v, float v2);

// Prototypes for fmax with the same argument shapes as fmin above.
extern float __attribute__((overloadable)) fmax(float v, float v2);
extern float2 __attribute__((overloadable)) fmax(float2 v, float v2);
extern float3 __attribute__((overloadable)) fmax(float3 v, float v2);
extern float4 __attribute__((overloadable)) fmax(float4 v, float v2);
22
23 // Float ops, 6.11.2
24
/*
 * FN_FUNC_FN(fnc): emits the float2, float3 and float4 overloads of a
 * one-argument function. Each overload calls fnc on every component of the
 * input vector and stores the results in the matching output component.
 */
#define FN_FUNC_FN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v) { \
    float2 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    return out; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v) { \
    float3 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    out.z = fnc(v.z); \
    return out; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v) { \
    float4 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    out.z = fnc(v.z); \
    out.w = fnc(v.w); \
    return out; \
}
47
/*
 * IN_FUNC_FN(fnc): emits int2/int3/int4-returning overloads of a function
 * that takes float2/float3/float4. Each component of the result is fnc
 * applied to the corresponding component of the input.
 */
#define IN_FUNC_FN(fnc) \
extern int2 __attribute__((overloadable)) fnc(float2 v) { \
    int2 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    return out; \
} \
extern int3 __attribute__((overloadable)) fnc(float3 v) { \
    int3 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    out.z = fnc(v.z); \
    return out; \
} \
extern int4 __attribute__((overloadable)) fnc(float4 v) { \
    int4 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    out.z = fnc(v.z); \
    out.w = fnc(v.w); \
    return out; \
}
70
/*
 * FN_FUNC_FN_FN(fnc): emits float2/float3/float4 overloads of a two-argument
 * function where both arguments are vectors of the same width. fnc is called
 * once per component pair.
 */
#define FN_FUNC_FN_FN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, float2 v2) { \
    float2 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    return out; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, float3 v2) { \
    float3 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    out.z = fnc(v1.z, v2.z); \
    return out; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, float4 v2) { \
    float4 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    out.z = fnc(v1.z, v2.z); \
    out.w = fnc(v1.w, v2.w); \
    return out; \
}
93
/*
 * FN_FUNC_FN_F(fnc): emits float2/float3/float4 overloads of a two-argument
 * function whose second argument is a single float shared by all components.
 */
#define FN_FUNC_FN_F(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, float v2) { \
    float2 out; \
    out.x = fnc(v1.x, v2); \
    out.y = fnc(v1.y, v2); \
    return out; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, float v2) { \
    float3 out; \
    out.x = fnc(v1.x, v2); \
    out.y = fnc(v1.y, v2); \
    out.z = fnc(v1.z, v2); \
    return out; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, float v2) { \
    float4 out; \
    out.x = fnc(v1.x, v2); \
    out.y = fnc(v1.y, v2); \
    out.z = fnc(v1.z, v2); \
    out.w = fnc(v1.w, v2); \
    return out; \
}
116
/*
 * FN_FUNC_FN_IN(fnc): emits float2/float3/float4 overloads of a function
 * taking a float vector and an int vector of the same width. fnc is called
 * once per (float, int) component pair.
 */
#define FN_FUNC_FN_IN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 v2) { \
    float2 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    return out; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 v2) { \
    float3 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    out.z = fnc(v1.z, v2.z); \
    return out; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 v2) { \
    float4 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    out.z = fnc(v1.z, v2.z); \
    out.w = fnc(v1.w, v2.w); \
    return out; \
}
139
/*
 * FN_FUNC_FN_I(fnc): emits float2/float3/float4 overloads of a function
 * taking a float vector and a single int that is reused for every component.
 */
#define FN_FUNC_FN_I(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int v2) { \
    float2 out; \
    out.x = fnc(v1.x, v2); \
    out.y = fnc(v1.y, v2); \
    return out; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int v2) { \
    float3 out; \
    out.x = fnc(v1.x, v2); \
    out.y = fnc(v1.y, v2); \
    out.z = fnc(v1.z, v2); \
    return out; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int v2) { \
    float4 out; \
    out.x = fnc(v1.x, v2); \
    out.y = fnc(v1.y, v2); \
    out.z = fnc(v1.z, v2); \
    out.w = fnc(v1.w, v2); \
    return out; \
}
162
/*
 * FN_FUNC_FN_PFN(fnc): emits float2/float3/float4 overloads of a function
 * that returns a float and also writes a float through a pointer argument.
 * fnc is called once per component with a scalar temporary as the
 * out-parameter; the temporaries are copied into *v2 after all calls.
 */
#define FN_FUNC_FN_PFN(fnc) \
extern float2 __attribute__((overloadable)) \
        fnc(float2 v1, float2 *v2) { \
    float2 out; \
    float px, py; \
    out.x = fnc(v1.x, &px); \
    out.y = fnc(v1.y, &py); \
    v2->x = px; \
    v2->y = py; \
    return out; \
} \
extern float3 __attribute__((overloadable)) \
        fnc(float3 v1, float3 *v2) { \
    float3 out; \
    float px, py, pz; \
    out.x = fnc(v1.x, &px); \
    out.y = fnc(v1.y, &py); \
    out.z = fnc(v1.z, &pz); \
    v2->x = px; \
    v2->y = py; \
    v2->z = pz; \
    return out; \
} \
extern float4 __attribute__((overloadable)) \
        fnc(float4 v1, float4 *v2) { \
    float4 out; \
    float px, py, pz, pw; \
    out.x = fnc(v1.x, &px); \
    out.y = fnc(v1.y, &py); \
    out.z = fnc(v1.z, &pz); \
    out.w = fnc(v1.w, &pw); \
    v2->x = px; \
    v2->y = py; \
    v2->z = pz; \
    v2->w = pw; \
    return out; \
}
200
/*
 * FN_FUNC_FN_PIN(fnc): emits float2/float3/float4 overloads of a function
 * that returns a float and also writes an int through a pointer argument.
 * fnc is called once per component with a scalar int temporary; the
 * temporaries are copied into *v2 after all calls.
 */
#define FN_FUNC_FN_PIN(fnc) \
extern float2 __attribute__((overloadable)) fnc(float2 v1, int2 *v2) { \
    float2 out; \
    int ix, iy; \
    out.x = fnc(v1.x, &ix); \
    out.y = fnc(v1.y, &iy); \
    v2->x = ix; \
    v2->y = iy; \
    return out; \
} \
extern float3 __attribute__((overloadable)) fnc(float3 v1, int3 *v2) { \
    float3 out; \
    int ix, iy, iz; \
    out.x = fnc(v1.x, &ix); \
    out.y = fnc(v1.y, &iy); \
    out.z = fnc(v1.z, &iz); \
    v2->x = ix; \
    v2->y = iy; \
    v2->z = iz; \
    return out; \
} \
extern float4 __attribute__((overloadable)) fnc(float4 v1, int4 *v2) { \
    float4 out; \
    int ix, iy, iz, iw; \
    out.x = fnc(v1.x, &ix); \
    out.y = fnc(v1.y, &iy); \
    out.z = fnc(v1.z, &iz); \
    out.w = fnc(v1.w, &iw); \
    v2->x = ix; \
    v2->y = iy; \
    v2->z = iz; \
    v2->w = iw; \
    return out; \
}
235
/*
 * FN_FUNC_FN_FN_FN(fnc): emits float2/float3/float4 overloads of a
 * three-argument function where all arguments are vectors of the same width.
 * fnc is called once per component triple.
 */
#define FN_FUNC_FN_FN_FN(fnc) \
extern float2 __attribute__((overloadable)) \
        fnc(float2 v1, float2 v2, float2 v3) { \
    float2 out; \
    out.x = fnc(v1.x, v2.x, v3.x); \
    out.y = fnc(v1.y, v2.y, v3.y); \
    return out; \
} \
extern float3 __attribute__((overloadable)) \
        fnc(float3 v1, float3 v2, float3 v3) { \
    float3 out; \
    out.x = fnc(v1.x, v2.x, v3.x); \
    out.y = fnc(v1.y, v2.y, v3.y); \
    out.z = fnc(v1.z, v2.z, v3.z); \
    return out; \
} \
extern float4 __attribute__((overloadable)) \
        fnc(float4 v1, float4 v2, float4 v3) { \
    float4 out; \
    out.x = fnc(v1.x, v2.x, v3.x); \
    out.y = fnc(v1.y, v2.y, v3.y); \
    out.z = fnc(v1.z, v2.z, v3.z); \
    out.w = fnc(v1.w, v2.w, v3.w); \
    return out; \
}
261
/*
 * FN_FUNC_FN_FN_PIN(fnc): emits float2/float3/float4 overloads of a function
 * taking two float vectors and an int-vector out-parameter. fnc is called
 * once per component with a scalar int temporary; the temporaries are copied
 * into *v3 after all calls.
 */
#define FN_FUNC_FN_FN_PIN(fnc) \
extern float2 __attribute__((overloadable)) \
        fnc(float2 v1, float2 v2, int2 *v3) { \
    float2 out; \
    int ix, iy; \
    out.x = fnc(v1.x, v2.x, &ix); \
    out.y = fnc(v1.y, v2.y, &iy); \
    v3->x = ix; \
    v3->y = iy; \
    return out; \
} \
extern float3 __attribute__((overloadable)) \
        fnc(float3 v1, float3 v2, int3 *v3) { \
    float3 out; \
    int ix, iy, iz; \
    out.x = fnc(v1.x, v2.x, &ix); \
    out.y = fnc(v1.y, v2.y, &iy); \
    out.z = fnc(v1.z, v2.z, &iz); \
    v3->x = ix; \
    v3->y = iy; \
    v3->z = iz; \
    return out; \
} \
extern float4 __attribute__((overloadable)) \
        fnc(float4 v1, float4 v2, int4 *v3) { \
    float4 out; \
    int ix, iy, iz, iw; \
    out.x = fnc(v1.x, v2.x, &ix); \
    out.y = fnc(v1.y, v2.y, &iy); \
    out.z = fnc(v1.z, v2.z, &iz); \
    out.w = fnc(v1.w, v2.w, &iw); \
    v3->x = ix; \
    v3->y = iy; \
    v3->z = iz; \
    v3->w = iw; \
    return out; \
}
299
/* Bit patterns compared against float_bits() results by isinf(). */
static const unsigned int iposinf = 0x7f800000;
static const unsigned int ineginf = 0xff800000;

/*
 * Returns the float whose bit pattern is iposinf.
 *
 * The bits are reinterpreted through a union rather than by dereferencing a
 * casted pointer: the pointer cast both discarded the const qualifier of
 * iposinf and read an unsigned int object through a float lvalue, which
 * breaks the strict aliasing rules.
 */
static float posinf() {
    union {
        unsigned int u;
        float f;
    } pun;
    pun.u = iposinf;
    return pun.f;
}
307
/*
 * Returns the 32-bit object representation of f as an unsigned int.
 *
 * The value is reinterpreted through a union, which follows the aliasing
 * rules without needing memcpy (the memcpy variant was previously held back
 * by Mac SDK build issues, per the original TODO(jeanluc) note).
 */
static unsigned int float_bits(float f) {
    union {
        float f;
        unsigned int u;
    } pun;
    pun.f = f;
    return pun.u;
}
317
isinf(float f)318 static bool isinf(float f) {
319 unsigned int i = float_bits(f);
320 return (i == iposinf) || (i == ineginf);
321 }
322
isnan(float f)323 static bool isnan(float f) {
324 unsigned int i = float_bits(f);
325 return (((i & 0x7f800000) == 0x7f800000) && (i & 0x007fffff));
326 }
327
isposzero(float f)328 static bool isposzero(float f) {
329 return (float_bits(f) == 0x00000000);
330 }
331
isnegzero(float f)332 static bool isnegzero(float f) {
333 return (float_bits(f) == 0x80000000);
334 }
335
iszero(float f)336 static bool iszero(float f) {
337 return isposzero(f) || isnegzero(f);
338 }
339
340
341 extern float __attribute__((overloadable)) SC_acosf(float);
acos(float v)342 float __attribute__((overloadable)) acos(float v) {
343 return SC_acosf(v);
344 }
345 FN_FUNC_FN(acos)
346
347 extern float __attribute__((overloadable)) SC_acoshf(float);
acosh(float v)348 float __attribute__((overloadable)) acosh(float v) {
349 return SC_acoshf(v);
350 }
FN_FUNC_FN(acosh)351 FN_FUNC_FN(acosh)
352
353
354 extern float __attribute__((overloadable)) acospi(float v) {
355 return acos(v) / M_PI;
356 }
357 FN_FUNC_FN(acospi)
358
359 extern float __attribute__((overloadable)) SC_asinf(float);
asin(float v)360 float __attribute__((overloadable)) asin(float v) {
361 return SC_asinf(v);
362 }
363 FN_FUNC_FN(asin)
364
365 extern float __attribute__((overloadable)) SC_asinhf(float);
asinh(float v)366 float __attribute__((overloadable)) asinh(float v) {
367 return SC_asinhf(v);
368 }
FN_FUNC_FN(asinh)369 FN_FUNC_FN(asinh)
370
371 extern float __attribute__((overloadable)) asinpi(float v) {
372 return asin(v) / M_PI;
373 }
374 FN_FUNC_FN(asinpi)
375
376 extern float __attribute__((overloadable)) SC_atanf(float);
atan(float v)377 float __attribute__((overloadable)) atan(float v) {
378 return SC_atanf(v);
379 }
380 FN_FUNC_FN(atan)
381
382 extern float __attribute__((overloadable)) SC_atan2f(float, float);
atan2(float v1,float v2)383 float __attribute__((overloadable)) atan2(float v1, float v2) {
384 return SC_atan2f(v1, v2);
385 }
386 FN_FUNC_FN_FN(atan2)
387
388 extern float __attribute__((overloadable)) SC_atanhf(float);
atanh(float v)389 float __attribute__((overloadable)) atanh(float v) {
390 return SC_atanhf(v);
391 }
FN_FUNC_FN(atanh)392 FN_FUNC_FN(atanh)
393
394 extern float __attribute__((overloadable)) atanpi(float v) {
395 return atan(v) / M_PI;
396 }
FN_FUNC_FN(atanpi)397 FN_FUNC_FN(atanpi)
398
399
400 extern float __attribute__((overloadable)) atan2pi(float y, float x) {
401 return atan2(y, x) / M_PI;
402 }
403 FN_FUNC_FN_FN(atan2pi)
404
405 extern float __attribute__((overloadable)) SC_cbrtf(float);
cbrt(float v)406 float __attribute__((overloadable)) cbrt(float v) {
407 return SC_cbrtf(v);
408 }
409 FN_FUNC_FN(cbrt)
410
411 extern float __attribute__((overloadable)) SC_ceilf(float);
ceil(float v)412 float __attribute__((overloadable)) ceil(float v) {
413 return SC_ceilf(v);
414 }
415 FN_FUNC_FN(ceil)
416
417 extern float __attribute__((overloadable)) SC_copysignf(float, float);
copysign(float v1,float v2)418 float __attribute__((overloadable)) copysign(float v1, float v2) {
419 return SC_copysignf(v1, v2);
420 }
421 FN_FUNC_FN_FN(copysign)
422
423 extern float __attribute__((overloadable)) SC_cosf(float);
cos(float v)424 float __attribute__((overloadable)) cos(float v) {
425 return SC_cosf(v);
426 }
427 FN_FUNC_FN(cos)
428
429 extern float __attribute__((overloadable)) SC_coshf(float);
cosh(float v)430 float __attribute__((overloadable)) cosh(float v) {
431 return SC_coshf(v);
432 }
FN_FUNC_FN(cosh)433 FN_FUNC_FN(cosh)
434
435 extern float __attribute__((overloadable)) cospi(float v) {
436 return cos(v * M_PI);
437 }
438 FN_FUNC_FN(cospi)
439
440 extern float __attribute__((overloadable)) SC_erfcf(float);
erfc(float v)441 float __attribute__((overloadable)) erfc(float v) {
442 return SC_erfcf(v);
443 }
444 FN_FUNC_FN(erfc)
445
446 extern float __attribute__((overloadable)) SC_erff(float);
erf(float v)447 float __attribute__((overloadable)) erf(float v) {
448 return SC_erff(v);
449 }
450 FN_FUNC_FN(erf)
451
452 extern float __attribute__((overloadable)) SC_expf(float);
exp(float v)453 float __attribute__((overloadable)) exp(float v) {
454 return SC_expf(v);
455 }
456 FN_FUNC_FN(exp)
457
458 extern float __attribute__((overloadable)) SC_exp2f(float);
exp2(float v)459 float __attribute__((overloadable)) exp2(float v) {
460 return SC_exp2f(v);
461 }
462 FN_FUNC_FN(exp2)
463
464 extern float __attribute__((overloadable)) pow(float, float);
465
exp10(float v)466 extern float __attribute__((overloadable)) exp10(float v) {
467 return exp2(v * 3.321928095f);
468 }
469 FN_FUNC_FN(exp10)
470
471 extern float __attribute__((overloadable)) SC_expm1f(float);
expm1(float v)472 float __attribute__((overloadable)) expm1(float v) {
473 return SC_expm1f(v);
474 }
FN_FUNC_FN(expm1)475 FN_FUNC_FN(expm1)
476
/*
 * Scalar fabs: returns v with its top (sign) bit cleared.
 *
 * The bits are accessed through a union instead of casted pointers; reading
 * a float object through an int lvalue (and the reverse) violates the strict
 * aliasing rules.
 */
extern float __attribute__((overloadable)) fabs(float v) {
    union {
        float f;
        unsigned int u;
    } pun;
    pun.f = v;
    pun.u &= 0x7fffffffu;  // keep everything except the sign bit
    return pun.f;
}
// Emit the float2/float3/float4 overloads of fabs via FN_FUNC_FN.
FN_FUNC_FN(fabs)
482
483 extern float __attribute__((overloadable)) SC_fdimf(float, float);
fdim(float v1,float v2)484 float __attribute__((overloadable)) fdim(float v1, float v2) {
485 return SC_fdimf(v1, v2);
486 }
487 FN_FUNC_FN_FN(fdim)
488
489 extern float __attribute__((overloadable)) SC_floorf(float);
floor(float v)490 float __attribute__((overloadable)) floor(float v) {
491 return SC_floorf(v);
492 }
493 FN_FUNC_FN(floor)
494
495 extern float __attribute__((overloadable)) SC_fmaf(float, float, float);
fma(float v1,float v2,float v3)496 float __attribute__((overloadable)) fma(float v1, float v2, float v3) {
497 return SC_fmaf(v1, v2, v3);
498 }
499 FN_FUNC_FN_FN_FN(fma)
500
501 extern float __attribute__((overloadable)) SC_fminf(float, float);
502
503 extern float __attribute__((overloadable)) SC_fmodf(float, float);
fmod(float v1,float v2)504 float __attribute__((overloadable)) fmod(float v1, float v2) {
505 return SC_fmodf(v1, v2);
506 }
FN_FUNC_FN_FN(fmod)507 FN_FUNC_FN_FN(fmod)
508
509 extern float __attribute__((overloadable)) fract(float v, float *iptr) {
510 int i = (int)floor(v);
511 if (iptr) {
512 iptr[0] = i;
513 }
514 return fmin(v - i, 0x1.fffffep-1f);
515 }
FN_FUNC_FN_PFN(fract)516 FN_FUNC_FN_PFN(fract)
517
518 extern float __attribute__((const, overloadable)) fract(float v) {
519 float unused;
520 return fract(v, &unused);
521 }
522 FN_FUNC_FN(fract)
523
524 extern float __attribute__((overloadable)) SC_frexpf(float, int *);
frexp(float v1,int * v2)525 float __attribute__((overloadable)) frexp(float v1, int* v2) {
526 return SC_frexpf(v1, v2);
527 }
528 FN_FUNC_FN_PIN(frexp)
529
530 extern float __attribute__((overloadable)) SC_hypotf(float, float);
hypot(float v1,float v2)531 float __attribute__((overloadable)) hypot(float v1, float v2) {
532 return SC_hypotf(v1, v2);
533 }
534 FN_FUNC_FN_FN(hypot)
535
536 extern int __attribute__((overloadable)) SC_ilogbf(float);
ilogb(float v)537 int __attribute__((overloadable)) ilogb(float v) {
538 return SC_ilogbf(v);
539 }
540 IN_FUNC_FN(ilogb)
541
542 extern float __attribute__((overloadable)) SC_ldexpf(float, int);
ldexp(float v1,int v2)543 float __attribute__((overloadable)) ldexp(float v1, int v2) {
544 return SC_ldexpf(v1, v2);
545 }
546 FN_FUNC_FN_IN(ldexp)
547 FN_FUNC_FN_I(ldexp)
548
549 extern float __attribute__((overloadable)) SC_lgammaf(float);
lgamma(float v)550 float __attribute__((overloadable)) lgamma(float v) {
551 return SC_lgammaf(v);
552 }
553 FN_FUNC_FN(lgamma)
554 extern float __attribute__((overloadable)) SC_lgammaf_r(float, int*);
lgamma(float v,int * ptr)555 float __attribute__((overloadable)) lgamma(float v, int* ptr) {
556 return SC_lgammaf_r(v, ptr);
557 }
558 FN_FUNC_FN_PIN(lgamma)
559
560 extern float __attribute__((overloadable)) SC_logf(float);
log(float v)561 float __attribute__((overloadable)) log(float v) {
562 return SC_logf(v);
563 }
564 FN_FUNC_FN(log)
565
566 extern float __attribute__((overloadable)) SC_log10f(float);
log10(float v)567 float __attribute__((overloadable)) log10(float v) {
568 return SC_log10f(v);
569 }
FN_FUNC_FN(log10)570 FN_FUNC_FN(log10)
571
572
573 extern float __attribute__((overloadable)) log2(float v) {
574 return log10(v) * 3.321928095f;
575 }
576 FN_FUNC_FN(log2)
577
578 extern float __attribute__((overloadable)) SC_log1pf(float);
log1p(float v)579 float __attribute__((overloadable)) log1p(float v) {
580 return SC_log1pf(v);
581 }
582 FN_FUNC_FN(log1p)
583
584 extern float __attribute__((overloadable)) SC_logbf(float);
logb(float v)585 float __attribute__((overloadable)) logb(float v) {
586 return SC_logbf(v);
587 }
FN_FUNC_FN(logb)588 FN_FUNC_FN(logb)
589
/* Scalar mad: returns a * b + c, written as a single expression. */
extern float __attribute__((overloadable)) mad(float a, float b, float c) {
    const float result = a * b + c;
    return result;
}
mad(float2 a,float2 b,float2 c)593 extern float2 __attribute__((overloadable)) mad(float2 a, float2 b, float2 c) {
594 return a * b + c;
595 }
mad(float3 a,float3 b,float3 c)596 extern float3 __attribute__((overloadable)) mad(float3 a, float3 b, float3 c) {
597 return a * b + c;
598 }
mad(float4 a,float4 b,float4 c)599 extern float4 __attribute__((overloadable)) mad(float4 a, float4 b, float4 c) {
600 return a * b + c;
601 }
602
// Declared here; the definition is not in this section of the file.
extern float __attribute__((overloadable)) SC_modff(float, float *);

/* Scalar modf: forwards the value and the float out-pointer to SC_modff. */
float __attribute__((overloadable)) modf(float v1, float *v2) {
    const float result = SC_modff(v1, v2);
    return result;
}
FN_FUNC_FN_PFN(modf);
608
/*
 * Builds a float whose bit pattern is (v | 0x7fc00000).
 *
 * The bits are placed through a union; storing through a uint32_t pointer
 * into a float object and then reading it as a float violates the strict
 * aliasing rules.
 */
extern float __attribute__((overloadable)) nan(uint v) {
    union {
        uint32_t u;
        float f;
    } pun;
    pun.u = v | 0x7fc00000;
    return pun.f;
}
615
616 extern float __attribute__((overloadable)) SC_nextafterf(float, float);
nextafter(float v1,float v2)617 float __attribute__((overloadable)) nextafter(float v1, float v2) {
618 return SC_nextafterf(v1, v2);
619 }
620 FN_FUNC_FN_FN(nextafter)
621
622 // This function must be defined here if we're compiling with debug info
623 // (libclcore_g.bc), because we need a C source to get debug information.
624 // Otherwise the implementation can be found in IR.
625 #if defined(RS_G_RUNTIME)
626 extern float __attribute__((overloadable)) SC_powf(float, float);
pow(float v1,float v2)627 float __attribute__((overloadable)) pow(float v1, float v2) {
628 return SC_powf(v1, v2);
629 }
630 #endif // defined(RS_G_RUNTIME)
FN_FUNC_FN_FN(pow)631 FN_FUNC_FN_FN(pow)
632
633 extern float __attribute__((overloadable)) pown(float v, int p) {
634 /* The mantissa of a float has fewer bits than an int (24 effective vs. 31).
635 * For very large ints, we'll lose whether the exponent is even or odd, making
636 * the selection of a correct sign incorrect. We correct this. Use copysign
637 * to handle the negative zero case.
638 */
639 float sign = (p & 0x1) ? copysign(1.f, v) : 1.f;
640 float f = pow(v, (float)p);
641 return copysign(f, sign);
642 }
FN_FUNC_FN_IN(pown)643 FN_FUNC_FN_IN(pown)
644
645 extern float __attribute__((overloadable)) powr(float v, float p) {
646 return pow(v, p);
647 }
powr(float2 v,float2 p)648 extern float2 __attribute__((overloadable)) powr(float2 v, float2 p) {
649 return pow(v, p);
650 }
powr(float3 v,float3 p)651 extern float3 __attribute__((overloadable)) powr(float3 v, float3 p) {
652 return pow(v, p);
653 }
powr(float4 v,float4 p)654 extern float4 __attribute__((overloadable)) powr(float4 v, float4 p) {
655 return pow(v, p);
656 }
657
658 extern float __attribute__((overloadable)) SC_remainderf(float, float);
remainder(float v1,float v2)659 float __attribute__((overloadable)) remainder(float v1, float v2) {
660 return SC_remainderf(v1, v2);
661 }
662 FN_FUNC_FN_FN(remainder)
663
664 extern float __attribute__((overloadable)) SC_remquof(float, float, int *);
remquo(float v1,float v2,int * v3)665 float __attribute__((overloadable)) remquo(float v1, float v2, int *v3) {
666 return SC_remquof(v1, v2, v3);
667 }
668 FN_FUNC_FN_FN_PIN(remquo)
669
670 extern float __attribute__((overloadable)) SC_rintf(float);
rint(float v)671 float __attribute__((overloadable)) rint(float v) {
672 return SC_rintf(v);
673 }
FN_FUNC_FN(rint)674 FN_FUNC_FN(rint)
675
676 extern float __attribute__((overloadable)) rootn(float v, int r) {
677 if (r == 0) {
678 return posinf();
679 }
680
681 if (iszero(v)) {
682 if (r < 0) {
683 if (r & 1) {
684 return copysign(posinf(), v);
685 } else {
686 return posinf();
687 }
688 } else {
689 if (r & 1) {
690 return copysign(0.f, v);
691 } else {
692 return 0.f;
693 }
694 }
695 }
696
697 if (!isinf(v) && !isnan(v) && (v < 0.f)) {
698 if (r & 1) {
699 return (-1.f * pow(-1.f * v, 1.f / r));
700 } else {
701 return nan(0);
702 }
703 }
704
705 return pow(v, 1.f / r);
706 }
707 FN_FUNC_FN_IN(rootn);
708
709 extern float __attribute__((overloadable)) SC_roundf(float);
round(float v)710 float __attribute__((overloadable)) round(float v) {
711 return SC_roundf(v);
712 }
713 FN_FUNC_FN(round)
714
// Declared here; the definition is not in this section of the file.
extern float __attribute__((overloadable)) SC_randf2(float, float);

/* rsRand(min, max): forwards both bounds to SC_randf2 in the same order. */
float __attribute__((overloadable)) rsRand(float min, float max) {
    const float sample = SC_randf2(min, max);
    return sample;
}
719
720
/* Scalar rsqrt: 1.f divided by sqrt(v). */
extern float __attribute__((overloadable)) rsqrt(float v) {
    const float inverse_root = 1.f / sqrt(v);
    return inverse_root;
}
724
725 #if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
726 // These functions must be defined here if we are not using the SSE
727 // implementation, which includes when we are built as part of the
728 // debug runtime (libclcore_debug.bc) or compiling with debug info.
729 #if defined(RS_G_RUNTIME)
730 extern float __attribute__((overloadable)) SC_sqrtf(float);
sqrt(float v)731 float __attribute__((overloadable)) sqrt(float v) {
732 return SC_sqrtf(v);
733 }
734 #endif // defined(RS_G_RUNTIME)
735
736 FN_FUNC_FN(sqrt)
737 #else
738 extern float2 __attribute__((overloadable)) sqrt(float2);
739 extern float3 __attribute__((overloadable)) sqrt(float3);
740 extern float4 __attribute__((overloadable)) sqrt(float4);
741 #endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
742
// Emit the float2/float3/float4 overloads of rsqrt via FN_FUNC_FN.
FN_FUNC_FN(rsqrt)
744
745 extern float __attribute__((overloadable)) SC_sinf(float);
sin(float v)746 float __attribute__((overloadable)) sin(float v) {
747 return SC_sinf(v);
748 }
FN_FUNC_FN(sin)749 FN_FUNC_FN(sin)
750
751 extern float __attribute__((overloadable)) sincos(float v, float *cosptr) {
752 *cosptr = cos(v);
753 return sin(v);
754 }
sincos(float2 v,float2 * cosptr)755 extern float2 __attribute__((overloadable)) sincos(float2 v, float2 *cosptr) {
756 *cosptr = cos(v);
757 return sin(v);
758 }
sincos(float3 v,float3 * cosptr)759 extern float3 __attribute__((overloadable)) sincos(float3 v, float3 *cosptr) {
760 *cosptr = cos(v);
761 return sin(v);
762 }
sincos(float4 v,float4 * cosptr)763 extern float4 __attribute__((overloadable)) sincos(float4 v, float4 *cosptr) {
764 *cosptr = cos(v);
765 return sin(v);
766 }
767
768 extern float __attribute__((overloadable)) SC_sinhf(float);
sinh(float v)769 float __attribute__((overloadable)) sinh(float v) {
770 return SC_sinhf(v);
771 }
FN_FUNC_FN(sinh)772 FN_FUNC_FN(sinh)
773
774 extern float __attribute__((overloadable)) sinpi(float v) {
775 return sin(v * M_PI);
776 }
777 FN_FUNC_FN(sinpi)
778
779 extern float __attribute__((overloadable)) SC_tanf(float);
tan(float v)780 float __attribute__((overloadable)) tan(float v) {
781 return SC_tanf(v);
782 }
783 FN_FUNC_FN(tan)
784
785 extern float __attribute__((overloadable)) SC_tanhf(float);
tanh(float v)786 float __attribute__((overloadable)) tanh(float v) {
787 return SC_tanhf(v);
788 }
FN_FUNC_FN(tanh)789 FN_FUNC_FN(tanh)
790
791 extern float __attribute__((overloadable)) tanpi(float v) {
792 return tan(v * M_PI);
793 }
794 FN_FUNC_FN(tanpi)
795
796
797 extern float __attribute__((overloadable)) SC_tgammaf(float);
tgamma(float v)798 float __attribute__((overloadable)) tgamma(float v) {
799 return SC_tgammaf(v);
800 }
801 FN_FUNC_FN(tgamma)
802
803 extern float __attribute__((overloadable)) SC_truncf(float);
trunc(float v)804 float __attribute__((overloadable)) trunc(float v) {
805 return SC_truncf(v);
806 }
FN_FUNC_FN(trunc)807 FN_FUNC_FN(trunc)
808
809 // Int ops (partial), 6.11.3
810
/*
 * XN_FUNC_YN(typeout, fnc, typein): declares the scalar overload
 * typeout fnc(typein) and defines the 2-, 3- and 4-wide vector overloads,
 * each of which calls fnc on every component of its argument.
 */
#define XN_FUNC_YN(typeout, fnc, typein) \
extern typeout __attribute__((overloadable)) fnc(typein); \
extern typeout##2 __attribute__((overloadable)) fnc(typein##2 v) { \
    typeout##2 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    return out; \
} \
extern typeout##3 __attribute__((overloadable)) fnc(typein##3 v) { \
    typeout##3 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    out.z = fnc(v.z); \
    return out; \
} \
extern typeout##4 __attribute__((overloadable)) fnc(typein##4 v) { \
    typeout##4 out; \
    out.x = fnc(v.x); \
    out.y = fnc(v.y); \
    out.z = fnc(v.z); \
    out.w = fnc(v.w); \
    return out; \
}
834
835
/*
 * UIN_FUNC_IN(fnc): instantiates XN_FUNC_YN for the three signed input types
 * (char, short, int), each returning the unsigned type of the same width.
 */
#define UIN_FUNC_IN(fnc) \
XN_FUNC_YN(uchar, fnc, char) \
XN_FUNC_YN(ushort, fnc, short) \
XN_FUNC_YN(uint, fnc, int)
840
/*
 * IN_FUNC_IN(fnc): instantiates XN_FUNC_YN for the six 8/16/32-bit integer
 * types, with the return type equal to the argument type.
 */
#define IN_FUNC_IN(fnc) \
XN_FUNC_YN(uchar, fnc, uchar) \
XN_FUNC_YN(char, fnc, char) \
XN_FUNC_YN(ushort, fnc, ushort) \
XN_FUNC_YN(short, fnc, short) \
XN_FUNC_YN(uint, fnc, uint) \
XN_FUNC_YN(int, fnc, int)
848
849
/*
 * XN_FUNC_XN_XN_BODY(type, fnc, body): defines a two-argument scalar
 * function fnc(type v1, type v2) that returns the expression `body` (which
 * may refer to v1 and v2), plus the 2-, 3- and 4-wide vector overloads that
 * call fnc on each pair of components.
 */
#define XN_FUNC_XN_XN_BODY(type, fnc, body) \
extern type __attribute__((overloadable)) \
        fnc(type v1, type v2) { \
    return body; \
} \
extern type##2 __attribute__((overloadable)) \
        fnc(type##2 v1, type##2 v2) { \
    type##2 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    return out; \
} \
extern type##3 __attribute__((overloadable)) \
        fnc(type##3 v1, type##3 v2) { \
    type##3 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    out.z = fnc(v1.z, v2.z); \
    return out; \
} \
extern type##4 __attribute__((overloadable)) \
        fnc(type##4 v1, type##4 v2) { \
    type##4 out; \
    out.x = fnc(v1.x, v2.x); \
    out.y = fnc(v1.y, v2.y); \
    out.z = fnc(v1.z, v2.z); \
    out.w = fnc(v1.w, v2.w); \
    return out; \
}
879
/*
 * IN_FUNC_IN_IN_BODY(fnc, body): instantiates XN_FUNC_XN_XN_BODY with the
 * same body for the six 8/16/32-bit integer types and for float.
 */
#define IN_FUNC_IN_IN_BODY(fnc, body) \
XN_FUNC_XN_XN_BODY(uchar, fnc, body) \
XN_FUNC_XN_XN_BODY(char, fnc, body) \
XN_FUNC_XN_XN_BODY(ushort, fnc, body) \
XN_FUNC_XN_XN_BODY(short, fnc, body) \
XN_FUNC_XN_XN_BODY(uint, fnc, body) \
XN_FUNC_XN_XN_BODY(int, fnc, body) \
XN_FUNC_XN_XN_BODY(float, fnc, body)
888
889
890 /**
891 * abs
892 */
893 extern uint32_t __attribute__((overloadable)) abs(int32_t v) {
894 if (v < 0)
895 return -v;
896 return v;
897 }
/* 16-bit abs: returns -v for negative v and v otherwise, as a uint16_t. */
extern uint16_t __attribute__((overloadable)) abs(int16_t v) {
    return (v < 0) ? -v : v;
}
/* 8-bit abs: returns -v for negative v and v otherwise, as a uint8_t. */
extern uint8_t __attribute__((overloadable)) abs(int8_t v) {
    return (v < 0) ? -v : v;
}
908
/**
 * clz
 * __builtin_clz only accepts a 32-bit unsigned int, so every input is
 * expanded to 32 bits. For the smaller data types, the unused top bits
 * (which are always zero) have to be subtracted off.
 *
 * 32-bit unsigned overload. __builtin_clz has an undefined result for 0, so
 * zero is handled explicitly and reports all 32 bits as leading zeros.
 */
extern uint32_t __attribute__((overloadable)) clz(uint32_t v) {
    if (v == 0) {
        return 32;
    }
    return __builtin_clz(v);
}
/*
 * 16-bit unsigned clz. The value is widened to 32 bits for __builtin_clz, so
 * the 16 extra leading zeros are subtracted. __builtin_clz has an undefined
 * result for 0, so zero is handled explicitly and returns 16.
 */
extern uint16_t __attribute__((overloadable)) clz(uint16_t v) {
    if (v == 0) {
        return 16;
    }
    return __builtin_clz(v) - 16;
}
/*
 * 8-bit unsigned clz. The value is widened to 32 bits for __builtin_clz, so
 * the 24 extra leading zeros are subtracted. __builtin_clz has an undefined
 * result for 0, so zero is handled explicitly and returns 8.
 */
extern uint8_t __attribute__((overloadable)) clz(uint8_t v) {
    if (v == 0) {
        return 8;
    }
    return __builtin_clz(v) - 24;
}
/*
 * 32-bit signed clz: counts leading zero bits of v's 32-bit pattern.
 * __builtin_clz has an undefined result for 0, so zero is handled explicitly
 * and returns 32.
 */
extern int32_t __attribute__((overloadable)) clz(int32_t v) {
    if (v == 0) {
        return 32;
    }
    return __builtin_clz((uint32_t)v);
}
/*
 * 16-bit signed clz. Only the low 16 bits of v are examined (the mask drops
 * sign-extension bits), and the 16 extra leading zeros of the 32-bit
 * expansion are subtracted. __builtin_clz has an undefined result for 0, so
 * zero is handled explicitly and returns 16.
 */
extern int16_t __attribute__((overloadable)) clz(int16_t v) {
    const uint32_t low_bits = ((uint32_t)v) & 0x0000ffff;
    if (low_bits == 0) {
        return 16;
    }
    return __builtin_clz(low_bits) - 16;
}
/*
 * 8-bit signed clz. Only the low 8 bits of v are examined (the mask drops
 * sign-extension bits), and the 24 extra leading zeros of the 32-bit
 * expansion are subtracted. __builtin_clz has an undefined result for 0, so
 * zero is handled explicitly and returns 8.
 */
extern int8_t __attribute__((overloadable)) clz(int8_t v) {
    const uint32_t low_bits = ((uint32_t)v) & 0x000000ff;
    if (low_bits == 0) {
        return 8;
    }
    return __builtin_clz(low_bits) - 24;
}
933
934
// Emit the 2/3/4-wide vector overloads of abs (signed in, unsigned out)
// and of clz (same type in and out) from the scalar overloads above.
UIN_FUNC_IN(abs)
IN_FUNC_IN(clz)
937
938
939 // 6.11.4
940
941
942 extern float __attribute__((overloadable)) degrees(float radians) {
943 return radians * (180.f / M_PI);
944 }
degrees(float2 radians)945 extern float2 __attribute__((overloadable)) degrees(float2 radians) {
946 return radians * (180.f / M_PI);
947 }
degrees(float3 radians)948 extern float3 __attribute__((overloadable)) degrees(float3 radians) {
949 return radians * (180.f / M_PI);
950 }
degrees(float4 radians)951 extern float4 __attribute__((overloadable)) degrees(float4 radians) {
952 return radians * (180.f / M_PI);
953 }
954
/* Scalar mix: start + (stop - start) * amount. */
extern float __attribute__((overloadable)) mix(float start, float stop, float amount) {
    const float blended = start + (stop - start) * amount;
    return blended;
}
mix(float2 start,float2 stop,float2 amount)958 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float2 amount) {
959 return start + (stop - start) * amount;
960 }
mix(float3 start,float3 stop,float3 amount)961 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float3 amount) {
962 return start + (stop - start) * amount;
963 }
mix(float4 start,float4 stop,float4 amount)964 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float4 amount) {
965 return start + (stop - start) * amount;
966 }
mix(float2 start,float2 stop,float amount)967 extern float2 __attribute__((overloadable)) mix(float2 start, float2 stop, float amount) {
968 return start + (stop - start) * amount;
969 }
mix(float3 start,float3 stop,float amount)970 extern float3 __attribute__((overloadable)) mix(float3 start, float3 stop, float amount) {
971 return start + (stop - start) * amount;
972 }
mix(float4 start,float4 stop,float amount)973 extern float4 __attribute__((overloadable)) mix(float4 start, float4 stop, float amount) {
974 return start + (stop - start) * amount;
975 }
976
radians(float degrees)977 extern float __attribute__((overloadable)) radians(float degrees) {
978 return degrees * (M_PI / 180.f);
979 }
radians(float2 degrees)980 extern float2 __attribute__((overloadable)) radians(float2 degrees) {
981 return degrees * (M_PI / 180.f);
982 }
radians(float3 degrees)983 extern float3 __attribute__((overloadable)) radians(float3 degrees) {
984 return degrees * (M_PI / 180.f);
985 }
radians(float4 degrees)986 extern float4 __attribute__((overloadable)) radians(float4 degrees) {
987 return degrees * (M_PI / 180.f);
988 }
989
/* Scalar step: 0.f when v < edge, otherwise 1.f. */
extern float __attribute__((overloadable)) step(float edge, float v) {
    if (v < edge) {
        return 0.f;
    }
    return 1.f;
}
step(float2 edge,float2 v)993 extern float2 __attribute__((overloadable)) step(float2 edge, float2 v) {
994 float2 r;
995 r.x = (v.x < edge.x) ? 0.f : 1.f;
996 r.y = (v.y < edge.y) ? 0.f : 1.f;
997 return r;
998 }
step(float3 edge,float3 v)999 extern float3 __attribute__((overloadable)) step(float3 edge, float3 v) {
1000 float3 r;
1001 r.x = (v.x < edge.x) ? 0.f : 1.f;
1002 r.y = (v.y < edge.y) ? 0.f : 1.f;
1003 r.z = (v.z < edge.z) ? 0.f : 1.f;
1004 return r;
1005 }
step(float4 edge,float4 v)1006 extern float4 __attribute__((overloadable)) step(float4 edge, float4 v) {
1007 float4 r;
1008 r.x = (v.x < edge.x) ? 0.f : 1.f;
1009 r.y = (v.y < edge.y) ? 0.f : 1.f;
1010 r.z = (v.z < edge.z) ? 0.f : 1.f;
1011 r.w = (v.w < edge.w) ? 0.f : 1.f;
1012 return r;
1013 }
step(float2 edge,float v)1014 extern float2 __attribute__((overloadable)) step(float2 edge, float v) {
1015 float2 r;
1016 r.x = (v < edge.x) ? 0.f : 1.f;
1017 r.y = (v < edge.y) ? 0.f : 1.f;
1018 return r;
1019 }
step(float3 edge,float v)1020 extern float3 __attribute__((overloadable)) step(float3 edge, float v) {
1021 float3 r;
1022 r.x = (v < edge.x) ? 0.f : 1.f;
1023 r.y = (v < edge.y) ? 0.f : 1.f;
1024 r.z = (v < edge.z) ? 0.f : 1.f;
1025 return r;
1026 }
step(float4 edge,float v)1027 extern float4 __attribute__((overloadable)) step(float4 edge, float v) {
1028 float4 r;
1029 r.x = (v < edge.x) ? 0.f : 1.f;
1030 r.y = (v < edge.y) ? 0.f : 1.f;
1031 r.z = (v < edge.z) ? 0.f : 1.f;
1032 r.w = (v < edge.w) ? 0.f : 1.f;
1033 return r;
1034 }
step(float edge,float2 v)1035 extern float2 __attribute__((overloadable)) step(float edge, float2 v) {
1036 float2 r;
1037 r.x = (v.x < edge) ? 0.f : 1.f;
1038 r.y = (v.y < edge) ? 0.f : 1.f;
1039 return r;
1040 }
step(float edge,float3 v)1041 extern float3 __attribute__((overloadable)) step(float edge, float3 v) {
1042 float3 r;
1043 r.x = (v.x < edge) ? 0.f : 1.f;
1044 r.y = (v.y < edge) ? 0.f : 1.f;
1045 r.z = (v.z < edge) ? 0.f : 1.f;
1046 return r;
1047 }
step(float edge,float4 v)1048 extern float4 __attribute__((overloadable)) step(float edge, float4 v) {
1049 float4 r;
1050 r.x = (v.x < edge) ? 0.f : 1.f;
1051 r.y = (v.y < edge) ? 0.f : 1.f;
1052 r.z = (v.z < edge) ? 0.f : 1.f;
1053 r.w = (v.w < edge) ? 0.f : 1.f;
1054 return r;
1055 }
1056
sign(float v)1057 extern float __attribute__((overloadable)) sign(float v) {
1058 if (v > 0) return 1.f;
1059 if (v < 0) return -1.f;
1060 return v;
1061 }
FN_FUNC_FN(sign)1062 FN_FUNC_FN(sign)
1063
1064
1065 // 6.11.5
1066 extern float3 __attribute__((overloadable)) cross(float3 lhs, float3 rhs) {
1067 float3 r;
1068 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1069 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1070 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1071 return r;
1072 }
1073
cross(float4 lhs,float4 rhs)1074 extern float4 __attribute__((overloadable)) cross(float4 lhs, float4 rhs) {
1075 float4 r;
1076 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1077 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1078 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1079 r.w = 0.f;
1080 return r;
1081 }
1082
1083 #if !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
1084 // These functions must be defined here if we are not using the SSE
1085 // implementation, which includes when we are built as part of the
1086 // debug runtime (libclcore_debug.bc) or compiling with debug info.
1087
dot(float lhs,float rhs)1088 extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
1089 return lhs * rhs;
1090 }
dot(float2 lhs,float2 rhs)1091 extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
1092 return lhs.x*rhs.x + lhs.y*rhs.y;
1093 }
dot(float3 lhs,float3 rhs)1094 extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
1095 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
1096 }
dot(float4 lhs,float4 rhs)1097 extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
1098 return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
1099 }
1100
length(float v)1101 extern float __attribute__((overloadable)) length(float v) {
1102 return fabs(v);
1103 }
length(float2 v)1104 extern float __attribute__((overloadable)) length(float2 v) {
1105 return sqrt(v.x*v.x + v.y*v.y);
1106 }
length(float3 v)1107 extern float __attribute__((overloadable)) length(float3 v) {
1108 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
1109 }
length(float4 v)1110 extern float __attribute__((overloadable)) length(float4 v) {
1111 return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
1112 }
1113
1114 #else
1115
1116 extern float __attribute__((overloadable)) length(float v);
1117 extern float __attribute__((overloadable)) length(float2 v);
1118 extern float __attribute__((overloadable)) length(float3 v);
1119 extern float __attribute__((overloadable)) length(float4 v);
1120
1121 #endif // !defined(ARCH_X86_HAVE_SSSE3) || defined(RS_DEBUG_RUNTIME) || defined(RS_G_RUNTIME)
1122
distance(float lhs,float rhs)1123 extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
1124 return length(lhs - rhs);
1125 }
distance(float2 lhs,float2 rhs)1126 extern float __attribute__((overloadable)) distance(float2 lhs, float2 rhs) {
1127 return length(lhs - rhs);
1128 }
distance(float3 lhs,float3 rhs)1129 extern float __attribute__((overloadable)) distance(float3 lhs, float3 rhs) {
1130 return length(lhs - rhs);
1131 }
distance(float4 lhs,float4 rhs)1132 extern float __attribute__((overloadable)) distance(float4 lhs, float4 rhs) {
1133 return length(lhs - rhs);
1134 }
1135
1136 /* For the normalization functions, vectors of length 0 should simply be
1137 * returned (i.e. all the components of that vector are 0).
1138 */
/*
 * Scalar normalize: 0 for a zero input, -1 for a negative input and 1 for
 * everything else.
 */
extern float __attribute__((overloadable)) normalize(float v) {
    return (v == 0.0f) ? 0.0f : ((v < 0.0f) ? -1.0f : 1.0f);
}
normalize(float2 v)1148 extern float2 __attribute__((overloadable)) normalize(float2 v) {
1149 float l = length(v);
1150 return l == 0.0f ? v : v / l;
1151 }
normalize(float3 v)1152 extern float3 __attribute__((overloadable)) normalize(float3 v) {
1153 float l = length(v);
1154 return l == 0.0f ? v : v / l;
1155 }
normalize(float4 v)1156 extern float4 __attribute__((overloadable)) normalize(float4 v) {
1157 float l = length(v);
1158 return l == 0.0f ? v : v / l;
1159 }
1160
half_sqrt(float v)1161 extern float __attribute__((overloadable)) half_sqrt(float v) {
1162 return sqrt(v);
1163 }
FN_FUNC_FN(half_sqrt)1164 FN_FUNC_FN(half_sqrt)
1165
/* Length of a scalar is its absolute value. */
extern float __attribute__((overloadable)) fast_length(float v) {
    const float magnitude = fabs(v);
    return magnitude;
}

/* Vector length computed with half_sqrt on the sum of squared components. */
extern float __attribute__((overloadable)) fast_length(float2 v) {
    const float squared = v.x*v.x + v.y*v.y;
    return half_sqrt(squared);
}

extern float __attribute__((overloadable)) fast_length(float3 v) {
    const float squared = v.x*v.x + v.y*v.y + v.z*v.z;
    return half_sqrt(squared);
}

extern float __attribute__((overloadable)) fast_length(float4 v) {
    const float squared = v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w;
    return half_sqrt(squared);
}
1178
fast_distance(float lhs,float rhs)1179 extern float __attribute__((overloadable)) fast_distance(float lhs, float rhs) {
1180 return fast_length(lhs - rhs);
1181 }
fast_distance(float2 lhs,float2 rhs)1182 extern float __attribute__((overloadable)) fast_distance(float2 lhs, float2 rhs) {
1183 return fast_length(lhs - rhs);
1184 }
fast_distance(float3 lhs,float3 rhs)1185 extern float __attribute__((overloadable)) fast_distance(float3 lhs, float3 rhs) {
1186 return fast_length(lhs - rhs);
1187 }
fast_distance(float4 lhs,float4 rhs)1188 extern float __attribute__((overloadable)) fast_distance(float4 lhs, float4 rhs) {
1189 return fast_length(lhs - rhs);
1190 }
1191
1192 extern float __attribute__((overloadable)) half_rsqrt(float);
1193
1194 /* For the normalization functions, vectors of length 0 should simply be
1195 * returned (i.e. all the components of that vector are 0).
1196 */
/*
 * Scalar fast_normalize: 0 for a zero input, -1 for a negative input and 1
 * for everything else.
 */
extern float __attribute__((overloadable)) fast_normalize(float v) {
    if (v == 0.0f) {
        return 0.0f;
    }
    if (v < 0.0f) {
        return -1.0f;
    }
    return 1.0f;
}
1206 // If the length is 0, then rlength should be NaN.
fast_normalize(float2 v)1207 extern float2 __attribute__((overloadable)) fast_normalize(float2 v) {
1208 float rlength = half_rsqrt(v.x*v.x + v.y*v.y);
1209 return (rlength == rlength) ? v * rlength : v;
1210 }
fast_normalize(float3 v)1211 extern float3 __attribute__((overloadable)) fast_normalize(float3 v) {
1212 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z);
1213 return (rlength == rlength) ? v * rlength : v;
1214 }
fast_normalize(float4 v)1215 extern float4 __attribute__((overloadable)) fast_normalize(float4 v) {
1216 float rlength = half_rsqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
1217 return (rlength == rlength) ? v * rlength : v;
1218 }
1219
/* Reciprocal of v, computed as a plain float division. */
extern float __attribute__((overloadable)) half_recip(float v) {
    const float one = 1.f;
    return one / v;
}
1223
1224 /*
1225 extern float __attribute__((overloadable)) approx_atan(float x) {
1226 if (x == 0.f)
1227 return 0.f;
1228 if (x < 0.f)
1229 return -1.f * approx_atan(-1.f * x);
1230 if (x > 1.f)
1231 return M_PI_2 - approx_atan(approx_recip(x));
1232 return x * approx_recip(1.f + 0.28f * x*x);
1233 }
1234 FN_FUNC_FN(approx_atan)
1235 */
1236
/* Overlay of a float and a 32-bit integer, used to get at the float's bits. */
typedef union {
    float fv;
    int32_t iv;
} ieee_float_shape_type;

/* Get a 32 bit int from a float: stores the bit pattern of d into i. */
#define GET_FLOAT_WORD(i, d) \
    do { \
        const ieee_float_shape_type gf_u = { .fv = (d) }; \
        (i) = gf_u.iv; \
    } while (0)

/* Set a float from a 32 bit int: stores the float whose bit pattern is i into d. */
#define SET_FLOAT_WORD(d, i) \
    do { \
        const ieee_float_shape_type sf_u = { .iv = (i) }; \
        (d) = sf_u.fv; \
    } while (0)
1260
1261
1262
// Valid -125 to 125
/*
 * Approximates 2^v.  The input is split into a whole part, which is turned
 * into a power of two by writing it straight into the exponent field of a
 * float, and a remainder, which goes through a 4th-order polynomial
 * 1 + t + t^2/2 + t^3/6 + t^4/24 (the leading terms of the series for e^t).
 */
extern float __attribute__((overloadable)) native_exp2(float v) {
    int32_t truncated = (int)v;
    // ~floor(v): the shifted sign is added, lowering negative values by one
    // (assuming >> on a negative int is an arithmetic shift).
    int32_t whole = truncated + (truncated >> 31);
    float frac = (v - whole);

    // Biased exponent (whole + 127) placed in bits 23 and up, mantissa zero.
    float scale;
    SET_FLOAT_WORD(scale, (whole + 127) << 23);

    frac *= 0.694f; // ~ ln(2), so that 2^frac is evaluated as e^(frac * ln 2)
    float frac2 = frac*frac;
    float poly = 1.f + frac + (frac2 * 0.5f) + (frac2*frac * 0.166666f) + (frac2*frac2 * 0.0416666f);
    return scale * poly;
}
1277
native_exp2(float2 v)1278 extern float2 __attribute__((overloadable)) native_exp2(float2 v) {
1279 int2 iv = convert_int2(v);
1280 int2 x = iv + (iv >> (int2)31);//floor(v);
1281 float2 r = (v - convert_float2(x));
1282
1283 x += 127;
1284
1285 float2 fo = (float2)(x << (int2)23);
1286
1287 r *= 0.694f; // ~ log(e) / log(2)
1288 float2 r2 = r*r;
1289 float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
1290 return fo * adj;
1291 }
1292
native_exp2(float4 v)1293 extern float4 __attribute__((overloadable)) native_exp2(float4 v) {
1294 int4 iv = convert_int4(v);
1295 int4 x = iv + (iv >> (int4)31);//floor(v);
1296 float4 r = (v - convert_float4(x));
1297
1298 x += 127;
1299
1300 float4 fo = (float4)(x << (int4)23);
1301
1302 r *= 0.694f; // ~ log(e) / log(2)
1303 float4 r2 = r*r;
1304 float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
1305 return fo * adj;
1306 }
1307
native_exp2(float3 v)1308 extern float3 __attribute__((overloadable)) native_exp2(float3 v) {
1309 float4 t = 1.f;
1310 t.xyz = v;
1311 return native_exp2(t).xyz;
1312 }
1313
1314
native_exp(float v)1315 extern float __attribute__((overloadable)) native_exp(float v) {
1316 return native_exp2(v * 1.442695041f);
1317 }
native_exp(float2 v)1318 extern float2 __attribute__((overloadable)) native_exp(float2 v) {
1319 return native_exp2(v * 1.442695041f);
1320 }
native_exp(float3 v)1321 extern float3 __attribute__((overloadable)) native_exp(float3 v) {
1322 return native_exp2(v * 1.442695041f);
1323 }
native_exp(float4 v)1324 extern float4 __attribute__((overloadable)) native_exp(float4 v) {
1325 return native_exp2(v * 1.442695041f);
1326 }
1327
native_exp10(float v)1328 extern float __attribute__((overloadable)) native_exp10(float v) {
1329 return native_exp2(v * 3.321928095f);
1330 }
native_exp10(float2 v)1331 extern float2 __attribute__((overloadable)) native_exp10(float2 v) {
1332 return native_exp2(v * 3.321928095f);
1333 }
native_exp10(float3 v)1334 extern float3 __attribute__((overloadable)) native_exp10(float3 v) {
1335 return native_exp2(v * 3.321928095f);
1336 }
native_exp10(float4 v)1337 extern float4 __attribute__((overloadable)) native_exp10(float4 v) {
1338 return native_exp2(v * 3.321928095f);
1339 }
1340
/*
 * Approximates log2(v) from the bits of v: the exponent field supplies the
 * whole part, and the mantissa (rebuilt as a float with exponent field 127)
 * goes through a 6th-order polynomial in (m - 1.5).  The coefficients are
 * the Taylor terms of ln() around 1.5, each divided by ln(2) = 0.693147181.
 */
extern float __attribute__((overloadable)) native_log2(float v) {
    int32_t bits;
    GET_FLOAT_WORD(bits, v);

    // Bits 23..30: the biased exponent.
    int32_t exponent = (bits >> 23) & 0xff;

    // Keep the 23 mantissa bits and overwrite the exponent field with 127.
    bits &= 0x7fffff;
    bits |= 127 << 23;

    float m;
    SET_FLOAT_WORD(m, bits);
    m -= 1.5f;
    float m2 = m*m;
    float frac_log = (0.405465108f / 0.693147181f) +
                     ((0.666666667f / 0.693147181f) * m) -
                     ((0.222222222f / 0.693147181f) * m2) +
                     ((0.098765432f / 0.693147181f) * m*m2) -
                     ((0.049382716f / 0.693147181f) * m2*m2) +
                     ((0.026337449f / 0.693147181f) * m*m2*m2) -
                     ((0.014631916f / 0.693147181f) * m2*m2*m2);
    return (float)(exponent - 127) + frac_log;
}
native_log2(float2 v)1363 extern float2 __attribute__((overloadable)) native_log2(float2 v) {
1364 float2 v2 = {native_log2(v.x), native_log2(v.y)};
1365 return v2;
1366 }
native_log2(float3 v)1367 extern float3 __attribute__((overloadable)) native_log2(float3 v) {
1368 float3 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z)};
1369 return v2;
1370 }
native_log2(float4 v)1371 extern float4 __attribute__((overloadable)) native_log2(float4 v) {
1372 float4 v2 = {native_log2(v.x), native_log2(v.y), native_log2(v.z), native_log2(v.w)};
1373 return v2;
1374 }
1375
native_log(float v)1376 extern float __attribute__((overloadable)) native_log(float v) {
1377 return native_log2(v) * (1.f / 1.442695041f);
1378 }
native_log(float2 v)1379 extern float2 __attribute__((overloadable)) native_log(float2 v) {
1380 return native_log2(v) * (1.f / 1.442695041f);
1381 }
native_log(float3 v)1382 extern float3 __attribute__((overloadable)) native_log(float3 v) {
1383 return native_log2(v) * (1.f / 1.442695041f);
1384 }
native_log(float4 v)1385 extern float4 __attribute__((overloadable)) native_log(float4 v) {
1386 return native_log2(v) * (1.f / 1.442695041f);
1387 }
1388
native_log10(float v)1389 extern float __attribute__((overloadable)) native_log10(float v) {
1390 return native_log2(v) * (1.f / 3.321928095f);
1391 }
native_log10(float2 v)1392 extern float2 __attribute__((overloadable)) native_log10(float2 v) {
1393 return native_log2(v) * (1.f / 3.321928095f);
1394 }
native_log10(float3 v)1395 extern float3 __attribute__((overloadable)) native_log10(float3 v) {
1396 return native_log2(v) * (1.f / 3.321928095f);
1397 }
native_log10(float4 v)1398 extern float4 __attribute__((overloadable)) native_log10(float4 v) {
1399 return native_log2(v) * (1.f / 3.321928095f);
1400 }
1401
1402
native_powr(float v,float y)1403 extern float __attribute__((overloadable)) native_powr(float v, float y) {
1404 float v2 = native_log2(v);
1405 v2 = fmax(v2 * y, -125.f);
1406 return native_exp2(v2);
1407 }
native_powr(float2 v,float2 y)1408 extern float2 __attribute__((overloadable)) native_powr(float2 v, float2 y) {
1409 float2 v2 = native_log2(v);
1410 v2 = fmax(v2 * y, -125.f);
1411 return native_exp2(v2);
1412 }
native_powr(float3 v,float3 y)1413 extern float3 __attribute__((overloadable)) native_powr(float3 v, float3 y) {
1414 float3 v2 = native_log2(v);
1415 v2 = fmax(v2 * y, -125.f);
1416 return native_exp2(v2);
1417 }
native_powr(float4 v,float4 y)1418 extern float4 __attribute__((overloadable)) native_powr(float4 v, float4 y) {
1419 float4 v2 = native_log2(v);
1420 v2 = fmax(v2 * y, -125.f);
1421 return native_exp2(v2);
1422 }
1423
/* Smaller of two doubles; v2 is returned whenever v1 < v2 does not hold. */
extern double __attribute__((overloadable)) min(double v1, double v2) {
    if (v1 < v2) {
        return v1;
    }
    return v2;
}
1427
min(double2 v1,double2 v2)1428 extern double2 __attribute__((overloadable)) min(double2 v1, double2 v2) {
1429 double2 r;
1430 r.x = v1.x < v2.x ? v1.x : v2.x;
1431 r.y = v1.y < v2.y ? v1.y : v2.y;
1432 return r;
1433 }
1434
min(double3 v1,double3 v2)1435 extern double3 __attribute__((overloadable)) min(double3 v1, double3 v2) {
1436 double3 r;
1437 r.x = v1.x < v2.x ? v1.x : v2.x;
1438 r.y = v1.y < v2.y ? v1.y : v2.y;
1439 r.z = v1.z < v2.z ? v1.z : v2.z;
1440 return r;
1441 }
1442
min(double4 v1,double4 v2)1443 extern double4 __attribute__((overloadable)) min(double4 v1, double4 v2) {
1444 double4 r;
1445 r.x = v1.x < v2.x ? v1.x : v2.x;
1446 r.y = v1.y < v2.y ? v1.y : v2.y;
1447 r.z = v1.z < v2.z ? v1.z : v2.z;
1448 r.w = v1.w < v2.w ? v1.w : v2.w;
1449 return r;
1450 }
1451
/* Smaller of two signed long values. */
extern long __attribute__((overloadable)) min(long v1, long v2) {
    if (v1 < v2) {
        return v1;
    }
    return v2;
}
min(long2 v1,long2 v2)1455 extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
1456 long2 r;
1457 r.x = v1.x < v2.x ? v1.x : v2.x;
1458 r.y = v1.y < v2.y ? v1.y : v2.y;
1459 return r;
1460 }
min(long3 v1,long3 v2)1461 extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
1462 long3 r;
1463 r.x = v1.x < v2.x ? v1.x : v2.x;
1464 r.y = v1.y < v2.y ? v1.y : v2.y;
1465 r.z = v1.z < v2.z ? v1.z : v2.z;
1466 return r;
1467 }
min(long4 v1,long4 v2)1468 extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
1469 long4 r;
1470 r.x = v1.x < v2.x ? v1.x : v2.x;
1471 r.y = v1.y < v2.y ? v1.y : v2.y;
1472 r.z = v1.z < v2.z ? v1.z : v2.z;
1473 r.w = v1.w < v2.w ? v1.w : v2.w;
1474 return r;
1475 }
1476
min(ulong v1,ulong v2)1477 extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
1478 return v1 < v2 ? v1 : v2;
1479 }
min(ulong2 v1,ulong2 v2)1480 extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
1481 ulong2 r;
1482 r.x = v1.x < v2.x ? v1.x : v2.x;
1483 r.y = v1.y < v2.y ? v1.y : v2.y;
1484 return r;
1485 }
min(ulong3 v1,ulong3 v2)1486 extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
1487 ulong3 r;
1488 r.x = v1.x < v2.x ? v1.x : v2.x;
1489 r.y = v1.y < v2.y ? v1.y : v2.y;
1490 r.z = v1.z < v2.z ? v1.z : v2.z;
1491 return r;
1492 }
min(ulong4 v1,ulong4 v2)1493 extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
1494 ulong4 r;
1495 r.x = v1.x < v2.x ? v1.x : v2.x;
1496 r.y = v1.y < v2.y ? v1.y : v2.y;
1497 r.z = v1.z < v2.z ? v1.z : v2.z;
1498 r.w = v1.w < v2.w ? v1.w : v2.w;
1499 return r;
1500 }
1501
/* Larger of two doubles; v2 is returned whenever v1 > v2 does not hold. */
extern double __attribute__((overloadable)) max(double v1, double v2) {
    if (v1 > v2) {
        return v1;
    }
    return v2;
}
1505
max(double2 v1,double2 v2)1506 extern double2 __attribute__((overloadable)) max(double2 v1, double2 v2) {
1507 double2 r;
1508 r.x = v1.x > v2.x ? v1.x : v2.x;
1509 r.y = v1.y > v2.y ? v1.y : v2.y;
1510 return r;
1511 }
1512
max(double3 v1,double3 v2)1513 extern double3 __attribute__((overloadable)) max(double3 v1, double3 v2) {
1514 double3 r;
1515 r.x = v1.x > v2.x ? v1.x : v2.x;
1516 r.y = v1.y > v2.y ? v1.y : v2.y;
1517 r.z = v1.z > v2.z ? v1.z : v2.z;
1518 return r;
1519 }
1520
max(double4 v1,double4 v2)1521 extern double4 __attribute__((overloadable)) max(double4 v1, double4 v2) {
1522 double4 r;
1523 r.x = v1.x > v2.x ? v1.x : v2.x;
1524 r.y = v1.y > v2.y ? v1.y : v2.y;
1525 r.z = v1.z > v2.z ? v1.z : v2.z;
1526 r.w = v1.w > v2.w ? v1.w : v2.w;
1527 return r;
1528 }
1529
/* Larger of two signed long values. */
extern long __attribute__((overloadable)) max(long v1, long v2) {
    if (v1 > v2) {
        return v1;
    }
    return v2;
}
max(long2 v1,long2 v2)1533 extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
1534 long2 r;
1535 r.x = v1.x > v2.x ? v1.x : v2.x;
1536 r.y = v1.y > v2.y ? v1.y : v2.y;
1537 return r;
1538 }
max(long3 v1,long3 v2)1539 extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
1540 long3 r;
1541 r.x = v1.x > v2.x ? v1.x : v2.x;
1542 r.y = v1.y > v2.y ? v1.y : v2.y;
1543 r.z = v1.z > v2.z ? v1.z : v2.z;
1544 return r;
1545 }
max(long4 v1,long4 v2)1546 extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
1547 long4 r;
1548 r.x = v1.x > v2.x ? v1.x : v2.x;
1549 r.y = v1.y > v2.y ? v1.y : v2.y;
1550 r.z = v1.z > v2.z ? v1.z : v2.z;
1551 r.w = v1.w > v2.w ? v1.w : v2.w;
1552 return r;
1553 }
1554
max(ulong v1,ulong v2)1555 extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
1556 return v1 > v2 ? v1 : v2;
1557 }
max(ulong2 v1,ulong2 v2)1558 extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
1559 ulong2 r;
1560 r.x = v1.x > v2.x ? v1.x : v2.x;
1561 r.y = v1.y > v2.y ? v1.y : v2.y;
1562 return r;
1563 }
max(ulong3 v1,ulong3 v2)1564 extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
1565 ulong3 r;
1566 r.x = v1.x > v2.x ? v1.x : v2.x;
1567 r.y = v1.y > v2.y ? v1.y : v2.y;
1568 r.z = v1.z > v2.z ? v1.z : v2.z;
1569 return r;
1570 }
max(ulong4 v1,ulong4 v2)1571 extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
1572 ulong4 r;
1573 r.x = v1.x > v2.x ? v1.x : v2.x;
1574 r.y = v1.y > v2.y ? v1.y : v2.y;
1575 r.z = v1.z > v2.z ? v1.z : v2.z;
1576 r.w = v1.w > v2.w ? v1.w : v2.w;
1577 return r;
1578 }
1579
/*
 * Emits native_<fn> for float, float2, float3 and float4.  Each generated
 * function forwards its single argument to fn and returns the result.
 */
#define THUNK_NATIVE_F(fn) \
    float __attribute__((overloadable)) native_##fn(float v) { \
        return fn(v); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 v) { \
        return fn(v); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 v) { \
        return fn(v); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 v) { \
        return fn(v); \
    }
1585
/*
 * Emits native_<fn> for float, float2, float3 and float4 where fn takes two
 * arguments of the same type.  Each generated function forwards both
 * arguments to fn and returns the result.
 */
#define THUNK_NATIVE_F_F(fn) \
    float __attribute__((overloadable)) native_##fn(float v1, float v2) { \
        return fn(v1, v2); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { \
        return fn(v1, v2); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { \
        return fn(v1, v2); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { \
        return fn(v1, v2); \
    }
1591
/*
 * Emits native_<fn> for float, float2, float3 and float4 where fn takes a
 * value and a pointer to the same type.  Both arguments are forwarded to fn
 * unchanged and its result is returned.
 */
#define THUNK_NATIVE_F_FP(fn) \
    float __attribute__((overloadable)) native_##fn(float v1, float *v2) { \
        return fn(v1, v2); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { \
        return fn(v1, v2); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { \
        return fn(v1, v2); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { \
        return fn(v1, v2); \
    }
1597
/*
 * Emits native_<fn> for float, float2, float3 and float4 where fn takes a
 * float value and an int value of the same vector width.  Both arguments
 * are forwarded to fn and its result is returned.
 */
#define THUNK_NATIVE_F_I(fn) \
    float __attribute__((overloadable)) native_##fn(float v1, int v2) { \
        return fn(v1, v2); \
    } \
    float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { \
        return fn(v1, v2); \
    } \
    float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { \
        return fn(v1, v2); \
    } \
    float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { \
        return fn(v1, v2); \
    }
1603
1604 THUNK_NATIVE_F(acos)
THUNK_NATIVE_F(acosh)1605 THUNK_NATIVE_F(acosh)
1606 THUNK_NATIVE_F(acospi)
1607 THUNK_NATIVE_F(asin)
1608 THUNK_NATIVE_F(asinh)
1609 THUNK_NATIVE_F(asinpi)
1610 THUNK_NATIVE_F(atan)
1611 THUNK_NATIVE_F_F(atan2)
1612 THUNK_NATIVE_F(atanh)
1613 THUNK_NATIVE_F(atanpi)
1614 THUNK_NATIVE_F_F(atan2pi)
1615 THUNK_NATIVE_F(cbrt)
1616 THUNK_NATIVE_F(cos)
1617 THUNK_NATIVE_F(cosh)
1618 THUNK_NATIVE_F(cospi)
1619 THUNK_NATIVE_F(expm1)
1620 THUNK_NATIVE_F_F(hypot)
1621 THUNK_NATIVE_F(log1p)
1622 THUNK_NATIVE_F_I(rootn)
1623 THUNK_NATIVE_F(rsqrt)
1624 THUNK_NATIVE_F(sqrt)
1625 THUNK_NATIVE_F(sin)
1626 THUNK_NATIVE_F_FP(sincos)
1627 THUNK_NATIVE_F(sinh)
1628 THUNK_NATIVE_F(sinpi)
1629 THUNK_NATIVE_F(tan)
1630 THUNK_NATIVE_F(tanh)
1631 THUNK_NATIVE_F(tanpi)
1632
1633 #undef THUNK_NATIVE_F
1634 #undef THUNK_NATIVE_F_F
1635 #undef THUNK_NATIVE_F_I
1636 #undef THUNK_NATIVE_F_FP
1637
1638 float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);}
native_normalize(float2 v)1639 float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);}
native_normalize(float3 v)1640 float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);}
native_normalize(float4 v)1641 float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);}
1642
native_distance(float v1,float v2)1643 float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);}
native_distance(float2 v1,float2 v2)1644 float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);}
native_distance(float3 v1,float3 v2)1645 float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);}
native_distance(float4 v1,float4 v2)1646 float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);}
1647
native_length(float v)1648 float __attribute__((overloadable)) native_length(float v) { return fast_length(v);}
native_length(float2 v)1649 float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);}
native_length(float3 v)1650 float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);}
native_length(float4 v)1651 float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);}
1652
native_divide(float v1,float v2)1653 float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;}
native_divide(float2 v1,float2 v2)1654 float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;}
native_divide(float3 v1,float3 v2)1655 float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;}
native_divide(float4 v1,float4 v2)1656 float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;}
1657
native_recip(float v)1658 float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;}
native_recip(float2 v)1659 float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;}
native_recip(float3 v)1660 float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;}
native_recip(float4 v)1661 float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;}
1662
1663
1664
1665
1666
1667 #undef FN_FUNC_FN
1668 #undef IN_FUNC_FN
1669 #undef FN_FUNC_FN_FN
1670 #undef FN_FUNC_FN_F
1671 #undef FN_FUNC_FN_IN
1672 #undef FN_FUNC_FN_I
1673 #undef FN_FUNC_FN_PFN
1674 #undef FN_FUNC_FN_PIN
1675 #undef FN_FUNC_FN_FN_FN
1676 #undef FN_FUNC_FN_FN_PIN
1677 #undef XN_FUNC_YN
1678 #undef UIN_FUNC_IN
1679 #undef IN_FUNC_IN
1680 #undef XN_FUNC_XN_XN_BODY
1681 #undef IN_FUNC_IN_IN_BODY
1682
/*
 * 16-bit pattern 0x7c00: sign bit clear, the five bits above the 10-bit
 * mantissa all set, mantissa zero.
 */
static const unsigned short kHalfPositiveInfinity = 0x7C00;
1684
/* Define f16 functions of the form
 *     HN output = fn(HN input)
 * where HN is scalar or vector half type.  The input is widened to the
 * matching float type, the float overload of fn is called, and the result
 * is narrowed back to half.
 */
#define HN_FUNC_HN(fn) \
extern half __attribute__((overloadable)) fn(half h) { \
    float widened = (float) h; \
    return (half) fn(widened); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v) { \
    float2 widened = convert_float2(v); \
    return convert_half2(fn(widened)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v) { \
    float3 widened = convert_float3(v); \
    return convert_half3(fn(widened)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v) { \
    float4 widened = convert_float4(v); \
    return convert_half4(fn(widened)); \
}
1702
/* Define f16 functions of the form
 *     HN output = fn(HN input1, HN input2)
 * where HN is scalar or vector half type.  Both inputs are widened to the
 * matching float type, the float overload of fn is called, and the result
 * is narrowed back to half.
 */
#define HN_FUNC_HN_HN(fn) \
extern half __attribute__((overloadable)) fn(half h1, half h2) { \
    float f1 = (float) h1; \
    float f2 = (float) h2; \
    return (half) fn(f1, f2); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \
    float2 f1 = convert_float2(v1); \
    float2 f2 = convert_float2(v2); \
    return convert_half2(fn(f1, f2)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \
    float3 f1 = convert_float3(v1); \
    float3 f2 = convert_float3(v2); \
    return convert_half3(fn(f1, f2)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \
    float4 f1 = convert_float4(v1); \
    float4 f2 = convert_float4(v2); \
    return convert_half4(fn(f1, f2)); \
}
1723
/* Define f16 functions of the form
 *     HN output = fn(HN input1, half input2)
 * where HN is a vector half type (half2, half3, half4).  The vector is
 * widened with convert_floatN, the scalar with a cast to float, and the
 * result of the float overload of fn is narrowed back to half.
 */
#define HN_FUNC_HN_H(fn) \
extern half2 __attribute__((overloadable)) fn(half2 v1, half v2) { \
    float2 f1 = convert_float2(v1); \
    float f2 = (float) v2; \
    return convert_half2(fn(f1, f2)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half v2) { \
    float3 f1 = convert_float3(v1); \
    float f2 = (float) v2; \
    return convert_half3(fn(f1, f2)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half v2) { \
    float4 f1 = convert_float4(v1); \
    float f2 = (float) v2; \
    return convert_half4(fn(f1, f2)); \
}
1738
/* Define f16 functions of the form
 *     HN output = fn(HN input1, HN input2, HN input3)
 * where HN is scalar or vector half type.  All three inputs are widened to
 * the matching float type, the float overload of fn is called, and the
 * result is narrowed back to half.
 */
#define HN_FUNC_HN_HN_HN(fn) \
extern half __attribute__((overloadable)) fn(half h1, half h2, half h3) { \
    float f1 = (float) h1; \
    float f2 = (float) h2; \
    float f3 = (float) h3; \
    return (half) fn(f1, f2, f3); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2, half2 v3) { \
    float2 f1 = convert_float2(v1); \
    float2 f2 = convert_float2(v2); \
    float2 f3 = convert_float2(v3); \
    return convert_half2(fn(f1, f2, f3)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2, half3 v3) { \
    float3 f1 = convert_float3(v1); \
    float3 f2 = convert_float3(v2); \
    float3 f3 = convert_float3(v3); \
    return convert_half3(fn(f1, f2, f3)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2, half4 v3) { \
    float4 f1 = convert_float4(v1); \
    float4 f2 = convert_float4(v2); \
    float4 f3 = convert_float4(v3); \
    return convert_half4(fn(f1, f2, f3)); \
}
1762
/* Define f16 functions of the form
 *     HN output = fn(HN input1, IN input2)
 * where HN is scalar or vector half type and IN the equivalent integer type
 * of same vector length.  Only the half input is widened to float; the
 * integer input is passed to the float overload of fn untouched.
 */
#define HN_FUNC_HN_IN(fn) \
extern half __attribute__((overloadable)) fn(half h1, int v) { \
    float widened = (float) h1; \
    return (half) fn(widened, v); \
} \
extern half2 __attribute__((overloadable)) fn(half2 v1, int2 v2) { \
    float2 widened = convert_float2(v1); \
    return convert_half2(fn(widened, v2)); \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, int3 v2) { \
    float3 widened = convert_float3(v1); \
    return convert_half3(fn(widened, v2)); \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, int4 v2) { \
    float4 widened = convert_float4(v1); \
    return convert_half4(fn(widened, v2)); \
}
1781
/* Define f16 functions of the form
 *     half output = fn(HN input1)
 * where HN is a scalar or vector half type.  The input is widened to the
 * matching float type and handed to the float overload of fn; the scalar
 * overload casts the result to half explicitly, the vector overloads rely
 * on the conversion performed by the return statement.
 */
#define H_FUNC_HN(fn) \
extern half __attribute__((overloadable)) fn(half h) { \
    float widened = (float) h; \
    return (half) fn(widened); \
} \
extern half __attribute__((overloadable)) fn(half2 v) { \
    float2 widened = convert_float2(v); \
    return fn(widened); \
} \
extern half __attribute__((overloadable)) fn(half3 v) { \
    float3 widened = convert_float3(v); \
    return fn(widened); \
} \
extern half __attribute__((overloadable)) fn(half4 v) { \
    float4 widened = convert_float4(v); \
    return fn(widened); \
}
1799
/* Define f16 functions of the form
 *     half output = fn(HN input1, HN input2)
 * where HN is a scalar or vector half type.  Both inputs are widened to the
 * matching float type and handed to the float overload of fn; the scalar
 * overload casts the result to half explicitly, the vector overloads rely
 * on the conversion performed by the return statement.
 */
#define H_FUNC_HN_HN(fn) \
extern half __attribute__((overloadable)) fn(half h1, half h2) { \
    float f1 = (float) h1; \
    float f2 = (float) h2; \
    return (half) fn(f1, f2); \
} \
extern half __attribute__((overloadable)) fn(half2 v1, half2 v2) { \
    float2 f1 = convert_float2(v1); \
    float2 f2 = convert_float2(v2); \
    return fn(f1, f2); \
} \
extern half __attribute__((overloadable)) fn(half3 v1, half3 v2) { \
    float3 f1 = convert_float3(v1); \
    float3 f2 = convert_float3(v2); \
    return fn(f1, f2); \
} \
extern half __attribute__((overloadable)) fn(half4 v1, half4 v2) { \
    float4 f1 = convert_float4(v1); \
    float4 f2 = convert_float4(v2); \
    return fn(f1, f2); \
}
1817
/* Define f16 vector functions of the form
 *     HN output = fnc(HN input1, HN *output2)
 * by calling the scalar fnc(half, half *) once per component.  Each
 * pointer result is collected in a local first and copied into *v2 after
 * all components have been computed.
 */
#define SCALARIZE_HN_FUNC_HN_PHN(fnc) \
extern half2 __attribute__((overloadable)) fnc(half2 v1, half2 *v2) { \
    half2 ret; \
    half outX, outY; \
    ret.x = fnc(v1.x, &outX); \
    ret.y = fnc(v1.y, &outY); \
    v2->x = outX; \
    v2->y = outY; \
    return ret; \
} \
extern half3 __attribute__((overloadable)) fnc(half3 v1, half3 *v2) { \
    half3 ret; \
    half outX, outY, outZ; \
    ret.x = fnc(v1.x, &outX); \
    ret.y = fnc(v1.y, &outY); \
    ret.z = fnc(v1.z, &outZ); \
    v2->x = outX; \
    v2->y = outY; \
    v2->z = outZ; \
    return ret; \
} \
extern half4 __attribute__((overloadable)) fnc(half4 v1, half4 *v2) { \
    half4 ret; \
    half outX, outY, outZ, outW; \
    ret.x = fnc(v1.x, &outX); \
    ret.y = fnc(v1.y, &outY); \
    ret.z = fnc(v1.z, &outZ); \
    ret.w = fnc(v1.w, &outW); \
    v2->x = outX; \
    v2->y = outY; \
    v2->z = outZ; \
    v2->w = outW; \
    return ret; \
}
1852
/* Define f16 functions of the form
 *     HN output = fn(HN input1, HN input2)
 * where HN is a vector half type (half2, half3 or half4).  Each overload
 * calls the scalar half function of the same name once per component, so a
 * scalar fn(half, half) must be declared before the macro is used.
 *
 * The definition ends on the closing brace of the half4 overload: there is
 * deliberately no line continuation after it, so the line following the
 * macro can never be spliced into the macro body.
 */
#define SCALARIZE_HN_FUNC_HN_HN(fn) \
extern half2 __attribute__((overloadable)) fn(half2 v1, half2 v2) { \
    half2 ret; \
    ret.x = fn(v1.x, v2.x); \
    ret.y = fn(v1.y, v2.y); \
    return ret; \
} \
extern half3 __attribute__((overloadable)) fn(half3 v1, half3 v2) { \
    half3 ret; \
    ret.x = fn(v1.x, v2.x); \
    ret.y = fn(v1.y, v2.y); \
    ret.z = fn(v1.z, v2.z); \
    return ret; \
} \
extern half4 __attribute__((overloadable)) fn(half4 v1, half4 v2) { \
    half4 ret; \
    ret.x = fn(v1.x, v2.x); \
    ret.y = fn(v1.y, v2.y); \
    ret.z = fn(v1.z, v2.z); \
    ret.w = fn(v1.w, v2.w); \
    return ret; \
}
1880
// Half overloads for the inverse trigonometric / hyperbolic functions.
// HN_FUNC_HN and HN_FUNC_HN_HN are defined before this point in the file.
// NOTE(review): presumably they generate the half/half2/half3/half4
// overloads by forwarding to the float versions -- confirm against the
// macro definitions.
HN_FUNC_HN(acos);
HN_FUNC_HN(acosh);
HN_FUNC_HN(acospi);
HN_FUNC_HN(asin);
HN_FUNC_HN(asinh);
HN_FUNC_HN(asinpi);
HN_FUNC_HN(atan);
HN_FUNC_HN(atanh);
HN_FUNC_HN(atanpi);
HN_FUNC_HN_HN(atan2);
HN_FUNC_HN_HN(atan2pi);

HN_FUNC_HN(cbrt);
HN_FUNC_HN(ceil);

// The scalar half copysign is only declared here (its definition is not in
// this part of the file); the macro below builds the half2/half3/half4
// overloads by calling it once per component.
extern half __attribute__((overloadable)) copysign(half x, half y);
SCALARIZE_HN_FUNC_HN_HN(copysign);

HN_FUNC_HN(cos);
HN_FUNC_HN(cosh);
HN_FUNC_HN(cospi);
1902
cross(half3 lhs,half3 rhs)1903 extern half3 __attribute__((overloadable)) cross(half3 lhs, half3 rhs) {
1904 half3 r;
1905 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1906 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1907 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1908 return r;
1909 }
1910
cross(half4 lhs,half4 rhs)1911 extern half4 __attribute__((overloadable)) cross(half4 lhs, half4 rhs) {
1912 half4 r;
1913 r.x = lhs.y * rhs.z - lhs.z * rhs.y;
1914 r.y = lhs.z * rhs.x - lhs.x * rhs.z;
1915 r.z = lhs.x * rhs.y - lhs.y * rhs.x;
1916 r.w = 0.f;
1917 return r;
1918 }
1919
// degrees: per-type overloads via HN_FUNC_HN (defined before this point).
HN_FUNC_HN(degrees);
// distance and dot take two half vectors (or scalars) and return one scalar
// half; H_FUNC_HN_HN widens both arguments to float and converts the result.
H_FUNC_HN_HN(distance);
H_FUNC_HN_HN(dot);

// Error function and exponential family.
HN_FUNC_HN(erf);
HN_FUNC_HN(erfc);
HN_FUNC_HN(exp);
HN_FUNC_HN(exp10);
HN_FUNC_HN(exp2);
HN_FUNC_HN(expm1);

// fmax and fmin are instantiated twice: once with HN_FUNC_HN_HN and once
// with HN_FUNC_HN_H.
// NOTE(review): judging by the macro names these are the (HN, HN) and
// (HN, half) argument forms -- confirm against the macro definitions.
HN_FUNC_HN(fabs);
HN_FUNC_HN_HN(fdim);
HN_FUNC_HN(floor);
HN_FUNC_HN_HN_HN(fma);
HN_FUNC_HN_HN(fmax);
HN_FUNC_HN_H(fmax);
HN_FUNC_HN_HN(fmin);
HN_FUNC_HN_H(fmin);
HN_FUNC_HN_HN(fmod);
1940
fract(half v,half * iptr)1941 extern half __attribute__((overloadable)) fract(half v, half *iptr) {
1942 // maxLessThanOne = 0.99951171875, the largest value < 1.0
1943 half maxLessThanOne;
1944 SET_HALF_WORD(maxLessThanOne, 0x3bff);
1945
1946 int i = (int) floor(v);
1947 if (iptr) {
1948 *iptr = i;
1949 }
1950 // return v - floor(v), if strictly less than one
1951 return fmin(v - i, maxLessThanOne);
1952 }
1953
// Build the half2/half3/half4 fract(v, iptr) overloads by calling the scalar
// fract(half, half *) defined above once per component.
SCALARIZE_HN_FUNC_HN_PHN(fract);
1955
fract(half v)1956 extern half __attribute__((const, overloadable)) fract(half v) {
1957 half unused;
1958 return fract(v, &unused);
1959 }
1960
fract(half2 v)1961 extern half2 __attribute__((const, overloadable)) fract(half2 v) {
1962 half2 unused;
1963 return fract(v, &unused);
1964 }
1965
fract(half3 v)1966 extern half3 __attribute__((const, overloadable)) fract(half3 v) {
1967 half3 unused;
1968 return fract(v, &unused);
1969 }
1970
fract(half4 v)1971 extern half4 __attribute__((const, overloadable)) fract(half4 v) {
1972 half4 unused;
1973 return fract(v, &unused);
1974 }
1975
// Scalar half frexp: declaration only; its definition is not in this part of
// the file.  The vector overloads below call it once per component.
extern half __attribute__((overloadable)) frexp(half x, int *eptr);
1977
frexp(half2 v1,int2 * eptr)1978 extern half2 __attribute__((overloadable)) frexp(half2 v1, int2 *eptr) {
1979 half2 ret;
1980 int e[2];
1981 ret.x = frexp(v1.x, &e[0]);
1982 ret.y = frexp(v1.y, &e[1]);
1983 eptr->x = e[0];
1984 eptr->y = e[1];
1985 return ret;
1986 }
1987
frexp(half3 v1,int3 * eptr)1988 extern half3 __attribute__((overloadable)) frexp(half3 v1, int3 *eptr) {
1989 half3 ret;
1990 int e[3];
1991 ret.x = frexp(v1.x, &e[0]);
1992 ret.y = frexp(v1.y, &e[1]);
1993 ret.z = frexp(v1.z, &e[2]);
1994 eptr->x = e[0];
1995 eptr->y = e[1];
1996 eptr->z = e[2];
1997 return ret;
1998 }
1999
frexp(half4 v1,int4 * eptr)2000 extern half4 __attribute__((overloadable)) frexp(half4 v1, int4 *eptr) {
2001 half4 ret;
2002 int e[4];
2003 ret.x = frexp(v1.x, &e[0]);
2004 ret.y = frexp(v1.y, &e[1]);
2005 ret.z = frexp(v1.z, &e[2]);
2006 ret.w = frexp(v1.w, &e[3]);
2007 eptr->x = e[0];
2008 eptr->y = e[1];
2009 eptr->z = e[2];
2010 eptr->w = e[3];
2011 return ret;
2012 }
2013
// hypot: two-argument overloads via HN_FUNC_HN_HN (defined before this point).
HN_FUNC_HN_HN(hypot);

// Scalar half ilogb: declaration only; its definition is not in this part of
// the file.  The vector overloads below call it once per component.
extern int __attribute__((overloadable)) ilogb(half x);
2017
ilogb(half2 v)2018 extern int2 __attribute__((overloadable)) ilogb(half2 v) {
2019 int2 ret;
2020 ret.x = ilogb(v.x);
2021 ret.y = ilogb(v.y);
2022 return ret;
2023 }
ilogb(half3 v)2024 extern int3 __attribute__((overloadable)) ilogb(half3 v) {
2025 int3 ret;
2026 ret.x = ilogb(v.x);
2027 ret.y = ilogb(v.y);
2028 ret.z = ilogb(v.z);
2029 return ret;
2030 }
ilogb(half4 v)2031 extern int4 __attribute__((overloadable)) ilogb(half4 v) {
2032 int4 ret;
2033 ret.x = ilogb(v.x);
2034 ret.y = ilogb(v.y);
2035 ret.z = ilogb(v.z);
2036 ret.w = ilogb(v.w);
2037 return ret;
2038 }
2039
// ldexp overloads via HN_FUNC_HN_IN (defined before this point).
// NOTE(review): presumably these take a matching int/intN exponent; the
// (halfN, int) forms are written out by hand below -- confirm against the
// macro definition.
HN_FUNC_HN_IN(ldexp);
ldexp(half2 v,int exponent)2041 extern half2 __attribute__((overloadable)) ldexp(half2 v, int exponent) {
2042 return convert_half2(ldexp(convert_float2(v), exponent));
2043 }
ldexp(half3 v,int exponent)2044 extern half3 __attribute__((overloadable)) ldexp(half3 v, int exponent) {
2045 return convert_half3(ldexp(convert_float3(v), exponent));
2046 }
ldexp(half4 v,int exponent)2047 extern half4 __attribute__((overloadable)) ldexp(half4 v, int exponent) {
2048 return convert_half4(ldexp(convert_float4(v), exponent));
2049 }
2050
// length returns one scalar half for any half/halfN input (H_FUNC_HN widens
// the argument to float and converts the result back).
H_FUNC_HN(length);
// Single-argument lgamma via HN_FUNC_HN (defined before this point); the
// overloads that also take a sign pointer are written out below.
HN_FUNC_HN(lgamma);
2053
lgamma(half h,int * signp)2054 extern half __attribute__((overloadable)) lgamma(half h, int *signp) {
2055 return (half) lgamma((float) h, signp);
2056 }
lgamma(half2 v,int2 * signp)2057 extern half2 __attribute__((overloadable)) lgamma(half2 v, int2 *signp) {
2058 return convert_half2(lgamma(convert_float2(v), signp));
2059 }
lgamma(half3 v,int3 * signp)2060 extern half3 __attribute__((overloadable)) lgamma(half3 v, int3 *signp) {
2061 return convert_half3(lgamma(convert_float3(v), signp));
2062 }
lgamma(half4 v,int4 * signp)2063 extern half4 __attribute__((overloadable)) lgamma(half4 v, int4 *signp) {
2064 return convert_half4(lgamma(convert_float4(v), signp));
2065 }
2066
// Logarithm family, via HN_FUNC_HN (defined before this point).
HN_FUNC_HN(log);
HN_FUNC_HN(log10);
HN_FUNC_HN(log1p);
HN_FUNC_HN(log2);
HN_FUNC_HN(logb);

// mad, max and min.  max and min are each instantiated with both
// HN_FUNC_HN_HN and HN_FUNC_HN_H.
// NOTE(review): judging by the macro names these are the (HN, HN) and
// (HN, half) argument forms -- confirm against the macro definitions.
HN_FUNC_HN_HN_HN(mad);
HN_FUNC_HN_HN(max);
HN_FUNC_HN_H(max); // TODO can this be arch-specific similar to _Z3maxDv2_ff?
HN_FUNC_HN_HN(min);
HN_FUNC_HN_H(min); // TODO can this be arch-specific similar to _Z3minDv2_ff?
2078
mix(half start,half stop,half amount)2079 extern half __attribute__((overloadable)) mix(half start, half stop, half amount) {
2080 return start + (stop - start) * amount;
2081 }
mix(half2 start,half2 stop,half2 amount)2082 extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half2 amount) {
2083 return start + (stop - start) * amount;
2084 }
mix(half3 start,half3 stop,half3 amount)2085 extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half3 amount) {
2086 return start + (stop - start) * amount;
2087 }
mix(half4 start,half4 stop,half4 amount)2088 extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half4 amount) {
2089 return start + (stop - start) * amount;
2090 }
mix(half2 start,half2 stop,half amount)2091 extern half2 __attribute__((overloadable)) mix(half2 start, half2 stop, half amount) {
2092 return start + (stop - start) * amount;
2093 }
mix(half3 start,half3 stop,half amount)2094 extern half3 __attribute__((overloadable)) mix(half3 start, half3 stop, half amount) {
2095 return start + (stop - start) * amount;
2096 }
mix(half4 start,half4 stop,half amount)2097 extern half4 __attribute__((overloadable)) mix(half4 start, half4 stop, half amount) {
2098 return start + (stop - start) * amount;
2099 }
2100
// Scalar half modf: declaration only; its definition is not in this part of
// the file.  The macro below builds the half2/half3/half4 overloads by
// calling it once per component.
extern half __attribute__((overloadable)) modf(half x, half *iptr);
SCALARIZE_HN_FUNC_HN_PHN(modf);
2103
nan_half()2104 half __attribute__((overloadable)) nan_half() {
2105 unsigned short nan_short = kHalfPositiveInfinity | 0x0200;
2106 half nan;
2107 SET_HALF_WORD(nan, nan_short);
2108 return nan;
2109 }
2110
// normalize: per-type overloads via HN_FUNC_HN (defined before this point).
HN_FUNC_HN(normalize);

// Scalar half nextafter: declaration only; its definition is not in this
// part of the file.  The macro below builds the half2/half3/half4 overloads
// by calling it once per component.
extern half __attribute__((overloadable)) nextafter(half x, half y);
SCALARIZE_HN_FUNC_HN_HN(nextafter);

// Power functions, radians and remainder.
HN_FUNC_HN_HN(pow);
HN_FUNC_HN_IN(pown);
HN_FUNC_HN_HN(powr);
HN_FUNC_HN(radians);
HN_FUNC_HN_HN(remainder);
2121
remquo(half n,half d,int * quo)2122 extern half __attribute__((overloadable)) remquo(half n, half d, int *quo) {
2123 return (float) remquo((float) n, (float) d, quo);
2124 }
remquo(half2 n,half2 d,int2 * quo)2125 extern half2 __attribute__((overloadable)) remquo(half2 n, half2 d, int2 *quo) {
2126 return convert_half2(remquo(convert_float2(d), convert_float2(n), quo));
2127 }
remquo(half3 n,half3 d,int3 * quo)2128 extern half3 __attribute__((overloadable)) remquo(half3 n, half3 d, int3 *quo) {
2129 return convert_half3(remquo(convert_float3(d), convert_float3(n), quo));
2130 }
remquo(half4 n,half4 d,int4 * quo)2131 extern half4 __attribute__((overloadable)) remquo(half4 n, half4 d, int4 *quo) {
2132 return convert_half4(remquo(convert_float4(d), convert_float4(n), quo));
2133 }
2134
// Rounding and root functions, via HN_FUNC_HN / HN_FUNC_HN_IN (both defined
// before this point).
HN_FUNC_HN(rint);
HN_FUNC_HN_IN(rootn);
HN_FUNC_HN(round);
HN_FUNC_HN(rsqrt);
2139
sign(half h)2140 extern half __attribute__((overloadable)) sign(half h) {
2141 if (h > 0) return (half) 1.f;
2142 if (h < 0) return (half) -1.f;
2143 return h;
2144 }
sign(half2 v)2145 extern half2 __attribute__((overloadable)) sign(half2 v) {
2146 half2 ret;
2147 ret.x = sign(v.x);
2148 ret.y = sign(v.y);
2149 return ret;
2150 }
sign(half3 v)2151 extern half3 __attribute__((overloadable)) sign(half3 v) {
2152 half3 ret;
2153 ret.x = sign(v.x);
2154 ret.y = sign(v.y);
2155 ret.z = sign(v.z);
2156 return ret;
2157 }
sign(half4 v)2158 extern half4 __attribute__((overloadable)) sign(half4 v) {
2159 half4 ret;
2160 ret.x = sign(v.x);
2161 ret.y = sign(v.y);
2162 ret.z = sign(v.z);
2163 ret.w = sign(v.w);
2164 return ret;
2165 }
2166
// sin: per-type overloads via HN_FUNC_HN (defined before this point); the
// sincos overloads below call them.
HN_FUNC_HN(sin);
2168
sincos(half v,half * cosptr)2169 extern half __attribute__((overloadable)) sincos(half v, half *cosptr) {
2170 *cosptr = cos(v);
2171 return sin(v);
2172 }
2173 // TODO verify if LLVM eliminates the duplicate convert_float2
sincos(half2 v,half2 * cosptr)2174 extern half2 __attribute__((overloadable)) sincos(half2 v, half2 *cosptr) {
2175 *cosptr = cos(v);
2176 return sin(v);
2177 }
sincos(half3 v,half3 * cosptr)2178 extern half3 __attribute__((overloadable)) sincos(half3 v, half3 *cosptr) {
2179 *cosptr = cos(v);
2180 return sin(v);
2181 }
sincos(half4 v,half4 * cosptr)2182 extern half4 __attribute__((overloadable)) sincos(half4 v, half4 *cosptr) {
2183 *cosptr = cos(v);
2184 return sin(v);
2185 }
2186
// sinh, sinpi and sqrt: per-type overloads via HN_FUNC_HN (defined before
// this point).
HN_FUNC_HN(sinh);
HN_FUNC_HN(sinpi);
HN_FUNC_HN(sqrt);
2190
step(half edge,half v)2191 extern half __attribute__((overloadable)) step(half edge, half v) {
2192 return (v < edge) ? 0.f : 1.f;
2193 }
step(half2 edge,half2 v)2194 extern half2 __attribute__((overloadable)) step(half2 edge, half2 v) {
2195 half2 r;
2196 r.x = (v.x < edge.x) ? 0.f : 1.f;
2197 r.y = (v.y < edge.y) ? 0.f : 1.f;
2198 return r;
2199 }
step(half3 edge,half3 v)2200 extern half3 __attribute__((overloadable)) step(half3 edge, half3 v) {
2201 half3 r;
2202 r.x = (v.x < edge.x) ? 0.f : 1.f;
2203 r.y = (v.y < edge.y) ? 0.f : 1.f;
2204 r.z = (v.z < edge.z) ? 0.f : 1.f;
2205 return r;
2206 }
step(half4 edge,half4 v)2207 extern half4 __attribute__((overloadable)) step(half4 edge, half4 v) {
2208 half4 r;
2209 r.x = (v.x < edge.x) ? 0.f : 1.f;
2210 r.y = (v.y < edge.y) ? 0.f : 1.f;
2211 r.z = (v.z < edge.z) ? 0.f : 1.f;
2212 r.w = (v.w < edge.w) ? 0.f : 1.f;
2213 return r;
2214 }
step(half2 edge,half v)2215 extern half2 __attribute__((overloadable)) step(half2 edge, half v) {
2216 half2 r;
2217 r.x = (v < edge.x) ? 0.f : 1.f;
2218 r.y = (v < edge.y) ? 0.f : 1.f;
2219 return r;
2220 }
step(half3 edge,half v)2221 extern half3 __attribute__((overloadable)) step(half3 edge, half v) {
2222 half3 r;
2223 r.x = (v < edge.x) ? 0.f : 1.f;
2224 r.y = (v < edge.y) ? 0.f : 1.f;
2225 r.z = (v < edge.z) ? 0.f : 1.f;
2226 return r;
2227 }
step(half4 edge,half v)2228 extern half4 __attribute__((overloadable)) step(half4 edge, half v) {
2229 half4 r;
2230 r.x = (v < edge.x) ? 0.f : 1.f;
2231 r.y = (v < edge.y) ? 0.f : 1.f;
2232 r.z = (v < edge.z) ? 0.f : 1.f;
2233 r.w = (v < edge.w) ? 0.f : 1.f;
2234 return r;
2235 }
step(half edge,half2 v)2236 extern half2 __attribute__((overloadable)) step(half edge, half2 v) {
2237 half2 r;
2238 r.x = (v.x < edge) ? 0.f : 1.f;
2239 r.y = (v.y < edge) ? 0.f : 1.f;
2240 return r;
2241 }
step(half edge,half3 v)2242 extern half3 __attribute__((overloadable)) step(half edge, half3 v) {
2243 half3 r;
2244 r.x = (v.x < edge) ? 0.f : 1.f;
2245 r.y = (v.y < edge) ? 0.f : 1.f;
2246 r.z = (v.z < edge) ? 0.f : 1.f;
2247 return r;
2248 }
step(half edge,half4 v)2249 extern half4 __attribute__((overloadable)) step(half edge, half4 v) {
2250 half4 r;
2251 r.x = (v.x < edge) ? 0.f : 1.f;
2252 r.y = (v.y < edge) ? 0.f : 1.f;
2253 r.z = (v.z < edge) ? 0.f : 1.f;
2254 r.w = (v.w < edge) ? 0.f : 1.f;
2255 return r;
2256 }
2257
// Remaining non-native functions: tangent family, tgamma and trunc, via
// HN_FUNC_HN (defined before this point).
HN_FUNC_HN(tan);
HN_FUNC_HN(tanh);
HN_FUNC_HN(tanpi);
HN_FUNC_HN(tgamma);
HN_FUNC_HN(trunc); // TODO: rethink: needs half-specific implementation?

// Half overloads of the native_* functions.  They are instantiated with the
// same helper macros as the functions above.
HN_FUNC_HN(native_acos);
HN_FUNC_HN(native_acosh);
HN_FUNC_HN(native_acospi);
HN_FUNC_HN(native_asin);
HN_FUNC_HN(native_asinh);
HN_FUNC_HN(native_asinpi);
HN_FUNC_HN(native_atan);
HN_FUNC_HN(native_atanh);
HN_FUNC_HN(native_atanpi);
HN_FUNC_HN_HN(native_atan2);
HN_FUNC_HN_HN(native_atan2pi);

HN_FUNC_HN(native_cbrt);
HN_FUNC_HN(native_cos);
HN_FUNC_HN(native_cosh);
HN_FUNC_HN(native_cospi);

// native_distance returns one scalar half (H_FUNC_HN_HN).
H_FUNC_HN_HN(native_distance);
HN_FUNC_HN_HN(native_divide);

HN_FUNC_HN(native_exp);
HN_FUNC_HN(native_exp10);
HN_FUNC_HN(native_exp2);
HN_FUNC_HN(native_expm1);

// native_length returns one scalar half (H_FUNC_HN).
HN_FUNC_HN_HN(native_hypot);
H_FUNC_HN(native_length);

HN_FUNC_HN(native_log);
HN_FUNC_HN(native_log10);
HN_FUNC_HN(native_log1p);
HN_FUNC_HN(native_log2);

HN_FUNC_HN(native_normalize);

HN_FUNC_HN_HN(native_powr); // TODO are parameter limits different for half?

HN_FUNC_HN(native_recip);
HN_FUNC_HN_IN(native_rootn);
HN_FUNC_HN(native_rsqrt);

HN_FUNC_HN(native_sin);
2306
native_sincos(half v,half * cosptr)2307 extern half __attribute__((overloadable)) native_sincos(half v, half *cosptr) {
2308 return sincos(v, cosptr);
2309 }
native_sincos(half2 v,half2 * cosptr)2310 extern half2 __attribute__((overloadable)) native_sincos(half2 v, half2 *cosptr) {
2311 return sincos(v, cosptr);
2312 }
native_sincos(half3 v,half3 * cosptr)2313 extern half3 __attribute__((overloadable)) native_sincos(half3 v, half3 *cosptr) {
2314 return sincos(v, cosptr);
2315 }
native_sincos(half4 v,half4 * cosptr)2316 extern half4 __attribute__((overloadable)) native_sincos(half4 v, half4 *cosptr) {
2317 return sincos(v, cosptr);
2318 }
2319
// Remaining native_* overloads, via HN_FUNC_HN (defined before this point).
HN_FUNC_HN(native_sinh);
HN_FUNC_HN(native_sinpi);
HN_FUNC_HN(native_sqrt);

HN_FUNC_HN(native_tan);
HN_FUNC_HN(native_tanh);
HN_FUNC_HN(native_tanpi);
2327
// The f16 helper macros are only needed to generate the overloads above;
// undefine all of them, including SCALARIZE_HN_FUNC_HN_PHN, so none of them
// stays defined for the rest of the file.
#undef HN_FUNC_HN
#undef HN_FUNC_HN_HN
#undef HN_FUNC_HN_H
#undef HN_FUNC_HN_HN_HN
#undef HN_FUNC_HN_IN
#undef H_FUNC_HN
#undef H_FUNC_HN_HN
#undef SCALARIZE_HN_FUNC_HN_HN
#undef SCALARIZE_HN_FUNC_HN_PHN
2336
// Exports math library functions that are otherwise unavailable to the
// compatibility library.  Everything in this section is compiled only when
// RS_COMPATIBILITY_LIB is defined.

#ifdef RS_COMPATIBILITY_LIB

// !!! DANGER !!!
// These functions are potentially missing on older Android versions.
// Work around the issue by supplying our own variants.
// !!! DANGER !!!

// NOTE(review): extern "C" is C++ linkage syntax and is not valid C --
// confirm how this section is compiled when RS_COMPATIBILITY_LIB is set.

// The logbl() implementation is taken from the latest bionic/, since
// double == long double on Android.  It simply forwards to logb().
extern "C" long double logbl(long double x) { return logb(x); }

// __aeabi_idiv0 is a missing function in libcompiler_rt.so, so we just
// pick the simplest implementation based on the ARM EABI doc: return the
// argument unchanged.
extern "C" int __aeabi_idiv0(int v) { return v; }

#endif // compatibility lib
2355