• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1.section #gm107_builtin_code
2// DIV U32
3//
4// UNR recurrence (q = a / b):
5// look for z such that 2^32 - b <= b * z < 2^32
6// then q - 1 <= (a * z) / 2^32 <= q
7//
8// INPUT:   $r0: dividend, $r1: divisor
9// OUTPUT:  $r0: result, $r1: modulus
10// CLOBBER: $r2 - $r3, $p0 - $p1
11// SIZE:    22 / 14 * 8 bytes
12//
// Outline of the code below (a = $r0 and b = $r1 on entry):
//   1. z = 1 << (flo(b) xor 31)            seed for the reciprocal
//   2. z = z + hi32(z * lo32(-b * z))      refinement, applied five times
//   3. q = hi32(a * z), r = a - b * q      quotient / modulus estimate
//   4. if r >= b: r -= b, q += 1           applied once, then once more
//                                          only where the first one fired
// The routine is straight-line and contains no test for b == 0.
//
// Every "sched" line carries three parenthesised groups and is followed by
// exactly three instructions, so instructions must not be added, removed or
// reordered without revisiting the sched lines.
// NOTE(review): st/wr/rd/wt are presumably stall-count and barrier fields
// -- confirm against the ISA documentation.
//
// NOTE(review): only $p0 is written by the visible code, so the $p1 entry
// in CLOBBER looks conservative; the SIZE figure does not obviously match
// the 27 instructions + 9 sched lines below -- confirm both.
13gm107_div_u32:
14   sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6)
   // $r2 = flo(b) xor 31.
   // NOTE(review): presumably flo yields the index of the highest set bit,
   // which would make $r2 the leading-zero count of b -- confirm.
15   flo u32 $r2 $r1
16   lop xor 1 $r2 $r2 0x1f
   // $r3 = 1 (trailing 0xf is presumably a lane mask -- confirm)
17   mov $r3 0x1 0xf
18   sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1)
   // $r2 = z = 1 << $r2, the initial reciprocal estimate
19   shl $r2 $r3 $r2
   // $r1 = -b; the divisor stays negated until the modulus is formed
20   i2i u32 u32 $r1 neg $r1
   // Refinement 1 of 5: $r3 = lo32(-b * z) ...
21   imul u32 u32 $r3 $r1 $r2
22   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
   // ... then z = hi32(z * $r3) + z
23   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Refinement 2 of 5
24   imul u32 u32 $r3 $r1 $r2
25   imad u32 u32 hi $r2 $r2 $r3 $r2
26   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
   // Refinement 3 of 5
27   imul u32 u32 $r3 $r1 $r2
28   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Refinement 4 of 5 (its imad follows the next sched line)
29   imul u32 u32 $r3 $r1 $r2
30   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
31   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Refinement 5 of 5
32   imul u32 u32 $r3 $r1 $r2
33   imad u32 u32 hi $r2 $r2 $r3 $r2
34   sched (st 0x6) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
   // Keep a copy of the dividend: $r3 = a
35   mov $r3 $r0 0xf
   // $r0 = q = hi32(a * z), the quotient estimate
36   imul u32 u32 hi $r0 $r0 $r2
   // $r2 = b again (negation of the already negated $r1)
37   i2i u32 u32 $r2 neg $r1
38   sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1)
   // $r1 = r = (-b) * q + a, the modulus estimate
39   imad u32 u32 $r1 $r1 $r0 $r3
   // $p0 = (r >= b), unsigned compare
40   isetp ge u32 and $p0 1 $r1 $r2 1
   // First fix-up, executed only where $p0 is set: r -= b ...
41   $p0 iadd $r1 $r1 neg $r2
42   sched (st 0x5) (st 0xd) (st 0x1)
   // ... and q += 1
43   $p0 iadd $r0 $r0 0x1
   // Re-test r >= b, but only where the first fix-up ran
44   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
   // Second fix-up: r -= b ...
45   $p0 iadd $r1 $r1 neg $r2
46   sched (st 0x1) (st 0xf) (st 0xf)
   // ... and q += 1
47   $p0 iadd $r0 $r0 0x1
   // Return with $r0 = quotient, $r1 = modulus
48   ret
   // Presumably filler so the last sched line still covers three instructions
49   nop 0
50
51// DIV S32, like DIV U32 after taking ABS(inputs)
52//
53// INPUT:   $r0: dividend, $r1: divisor
54// OUTPUT:  $r0: result, $r1: modulus
55// CLOBBER: $r2 - $r3, $p0 - $p3
56//
// Outline of the code below (a = $r0 and b = $r1 on entry):
//   1. $p2 = (a < 0), $p3 = (b < 0) xor $p2
//   2. a = abs(a), b = abs(b)
//   3. unsigned divide of a by b: shifted seed, five reciprocal
//      refinements, quotient / modulus estimate, up to two fix-ups
//   4. negate the result where $p3 is set, the modulus where $p2 is set
// The routine is straight-line and contains no test for b == 0.
//
// Every "sched" line carries three parenthesised groups and is followed by
// exactly three instructions, so instructions must not be added, removed or
// reordered without revisiting the sched lines.
//
// NOTE(review): $p1 is not written by the visible code, so the $p0 - $p3
// range in CLOBBER looks conservative -- confirm.
57gm107_div_s32:
58   sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0)
   // $p2 = (dividend < 0); no u32 qualifier, so presumably a signed compare
59   isetp lt and $p2 0x1 $r0 0 1
   // $p3 = (divisor < 0) xor $p2, i.e. set when the operand signs differ
60   isetp lt xor $p3 1 $r1 0 $p2
   // $r0 = abs(dividend)
61   i2i s32 s32 $r0 abs $r0
62   sched (st 0xf wr 0x1) (st 0xd wr 0x1 wt 0x2) (st 0x1 wt 0x2)
   // $r1 = abs(divisor); everything up to the final negations is unsigned
63   i2i s32 s32 $r1 abs $r1
   // $r2 = flo(b) xor 31.
   // NOTE(review): presumably flo yields the index of the highest set bit,
   // which would make $r2 the leading-zero count of b -- confirm.
64   flo u32 $r2 $r1
65   lop xor 1 $r2 $r2 0x1f
66   sched (st 0x6) (st 0x1) (st 0xf wr 0x1)
   // $r3 = 1 (trailing 0xf is presumably a lane mask -- confirm)
67   mov $r3 0x1 0xf
   // $r2 = z = 1 << $r2, the initial reciprocal estimate
68   shl $r2 $r3 $r2
   // $r1 = -b; the divisor stays negated until the modulus is formed
69   i2i u32 u32 $r1 neg $r1
70   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   // Refinement 1 of 5: $r3 = lo32(-b * z), then z = hi32(z * $r3) + z
71   imul u32 u32 $r3 $r1 $r2
72   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Refinement 2 of 5 (its imad follows the next sched line)
73   imul u32 u32 $r3 $r1 $r2
74   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
75   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Refinement 3 of 5
76   imul u32 u32 $r3 $r1 $r2
77   imad u32 u32 hi $r2 $r2 $r3 $r2
78   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   // Refinement 4 of 5
79   imul u32 u32 $r3 $r1 $r2
80   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Refinement 5 of 5 (its imad follows the next sched line)
81   imul u32 u32 $r3 $r1 $r2
82   sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2)
83   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Keep a copy of abs(dividend): $r3 = a
84   mov $r3 $r0 0xf
   // $r0 = q = hi32(a * z), the quotient estimate
85   imul u32 u32 hi $r0 $r0 $r2
86   sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3)
   // $r2 = b again (negation of the already negated $r1)
87   i2i u32 u32 $r2 neg $r1
   // $r1 = r = (-b) * q + a, the modulus estimate
88   imad u32 u32 $r1 $r1 $r0 $r3
   // $p0 = (r >= b), unsigned compare
89   isetp ge u32 and $p0 1 $r1 $r2 1
90   sched (st 0x1) (st 0x5) (st 0xd)
   // First fix-up, executed only where $p0 is set: r -= b, q += 1
91   $p0 iadd $r1 $r1 neg $r2
92   $p0 iadd $r0 $r0 0x1
   // Re-test r >= b, but only where the first fix-up ran
93   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
94   sched (st 0x1) (st 0x2) (st 0xf wr 0x0)
   // Second fix-up: r -= b, q += 1
95   $p0 iadd $r1 $r1 neg $r2
96   $p0 iadd $r0 $r0 0x1
   // Negate the result where the operand signs differed ($p3)
97   $p3 i2i s32 s32 $r0 neg $r0
98   sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
   // Negate the modulus where the dividend was negative ($p2)
99   $p2 i2i s32 s32 $r1 neg $r1
   // Return with $r0 = result, $r1 = modulus
100   ret
   // Presumably filler so the last sched line still covers three instructions
101   nop 0
102
103// STUB
// gm107_rcp_f64 and gm107_rsq_f64 label the same location: a lone ret
// followed by two nops. No register is read or written here, so a call to
// either entry point returns immediately without performing any f64
// computation and leaves the caller's registers as they were.
104gm107_rcp_f64:
105gm107_rsq_f64:
   // Three zeroed groups, one per instruction below
106   sched (st 0x0) (st 0x0) (st 0x0)
107   ret
   // Presumably filler so the sched line above covers three instructions
108   nop 0
109   nop 0
110
111.section #gm107_builtin_offsets
// One .b64 entry per builtin label defined in #gm107_builtin_code, in the
// order div_u32, div_s32, rcp_f64, rsq_f64. The last two entries refer to
// labels that mark the same stub location.
// NOTE(review): presumably the consumer looks builtins up by position in
// this table, so the entry order must not change -- confirm against the
// code that loads this section.
112.b64 #gm107_div_u32
113.b64 #gm107_div_s32
114.b64 #gm107_rcp_f64
115.b64 #gm107_rsq_f64
116