.section #gm107_builtin_code
// Builtin integer-division helpers for gm107.
//
// Layout note: every `sched` word carries one parenthesised control group
// for each of the three instructions that follow it, so instructions and
// their sched groups must be kept in lock-step; do not reorder either.
// NOTE(review): st/wr/rd/wt presumably mean stall count, write barrier,
// read barrier and wait mask - confirm against the assembler documentation.

// DIV U32
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
//          (the code below only references $p0; the range is conservative)
// SIZE:    22 / 14 * 8 bytes
//          NOTE(review): the body below is 27 instructions plus 9 sched
//          words, so this figure looks stale - confirm or update.
//
gm107_div_u32:
   // Initial estimate of z: $r2 = 1 << (flo($r1) ^ 0x1f).
   // NOTE(review): presumably flo yields the index of the divisor's highest
   // set bit, making the xor a leading-zero count - confirm.
   sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6)
   flo u32 $r2 $r1
   lop xor 1 $r2 $r2 0x1f
   mov $r3 0x1 0xf
   sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1)
   shl $r2 $r3 $r2
   // $r1 = -b, kept negated for the whole refinement loop.
   i2i u32 u32 $r1 neg $r1
   // Five unrolled refinement rounds, each one:
   //    $r3 = (-b) * z          (low 32 bits)
   //    $r2 = hi(z * $r3) + z
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Quotient estimate: save a in $r3, then $r0 = hi(a * z).
   sched (st 0x6) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
   mov $r3 $r0 0xf
   imul u32 u32 hi $r0 $r0 $r2
   // $r2 = -(-b) = b, needed as the comparison operand below.
   i2i u32 u32 $r2 neg $r1
   // Remainder estimate: $r1 = (-b) * q + a.
   sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1)
   imad u32 u32 $r1 $r1 $r0 $r3
   // First correction: if remainder >= b, subtract b and bump the quotient.
   isetp ge u32 and $p0 1 $r1 $r2 1
   $p0 iadd $r1 $r1 neg $r2
   sched (st 0x5) (st 0xd) (st 0x1)
   $p0 iadd $r0 $r0 0x1
   // Second correction: the compare itself is predicated on $p0, so it is
   // only evaluated when the first correction was applied.
   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
   $p0 iadd $r1 $r1 neg $r2
   sched (st 0x1) (st 0xf) (st 0xf)
   $p0 iadd $r0 $r0 0x1
   ret
   // Fills the third slot of the final sched group.
   nop 0

// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//          (the code below references $p0, $p2 and $p3; $p1 is not used)
//
gm107_div_s32:
   // Record the signs before they are discarded:
   //    $p2 = (dividend < 0)            -> sign applied to the modulus
   //    $p3 = (divisor < 0) xor $p2     -> sign applied to the quotient
   sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0)
   isetp lt and $p2 0x1 $r0 0 1
   isetp lt xor $p3 1 $r1 0 $p2
   // Replace both inputs with their absolute values.
   i2i s32 s32 $r0 abs $r0
   sched (st 0xf wr 0x1) (st 0xd wr 0x1 wt 0x2) (st 0x1 wt 0x2)
   i2i s32 s32 $r1 abs $r1
   // From here on the sequence mirrors gm107_div_u32 (same instructions,
   // different sched words): initial estimate $r2 = 1 << (flo($r1) ^ 0x1f).
   flo u32 $r2 $r1
   lop xor 1 $r2 $r2 0x1f
   sched (st 0x6) (st 0x1) (st 0xf wr 0x1)
   mov $r3 0x1 0xf
   shl $r2 $r3 $r2
   // $r1 = -|b|, kept negated for the refinement loop.
   i2i u32 u32 $r1 neg $r1
   // Five unrolled refinement rounds:
   //    $r3 = (-b) * z          (low 32 bits)
   //    $r2 = hi(z * $r3) + z
   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
   imul u32 u32 $r3 $r1 $r2
   imad u32 u32 hi $r2 $r2 $r3 $r2
   imul u32 u32 $r3 $r1 $r2
   sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2)
   imad u32 u32 hi $r2 $r2 $r3 $r2
   // Quotient estimate: save |a| in $r3, then $r0 = hi(|a| * z).
   mov $r3 $r0 0xf
   imul u32 u32 hi $r0 $r0 $r2
   sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3)
   // $r2 = |b| again; $r1 = (-|b|) * q + |a| is the remainder estimate.
   i2i u32 u32 $r2 neg $r1
   imad u32 u32 $r1 $r1 $r0 $r3
   // First correction: if remainder >= |b|, subtract |b| and bump quotient.
   isetp ge u32 and $p0 1 $r1 $r2 1
   sched (st 0x1) (st 0x5) (st 0xd)
   $p0 iadd $r1 $r1 neg $r2
   $p0 iadd $r0 $r0 0x1
   // Second correction, only evaluated when the first one was applied.
   $p0 isetp ge u32 and $p0 1 $r1 $r2 1
   sched (st 0x1) (st 0x2) (st 0xf wr 0x0)
   $p0 iadd $r1 $r1 neg $r2
   $p0 iadd $r0 $r0 0x1
   // Re-apply signs: negate the quotient when the input signs differed
   // ($p3) and negate the modulus when the dividend was negative ($p2).
   $p3 i2i s32 s32 $r0 neg $r0
   sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
   $p2 i2i s32 s32 $r1 neg $r1
   ret
   // Fills the third slot of the final sched group.
   nop 0

// STUB
// Both labels resolve to the same body, which returns immediately without
// reading or writing any register; the two nops fill the remaining slots of
// the sched group.
gm107_rcp_f64:
gm107_rsq_f64:
   sched (st 0x0) (st 0x0) (st 0x0)
   ret
   nop 0
   nop 0

// One 64-bit entry per builtin label defined above.
// NOTE(review): the entry order presumably has to match the builtin indices
// expected by whatever consumes this table - confirm before reordering.
.section #gm107_builtin_offsets
.b64 #gm107_div_u32
.b64 #gm107_div_s32
.b64 #gm107_rcp_f64
.b64 #gm107_rsq_f64