1.section #gk110_builtin_code 2// DIV U32 3// 4// UNR recurrence (q = a / b): 5// look for z such that 2^32 - b <= b * z < 2^32 6// then q - 1 <= (a * z) / 2^32 <= q 7// 8// INPUT: $r0: dividend, $r1: divisor 9// OUTPUT: $r0: result, $r1: modulus 10// CLOBBER: $r2 - $r3, $p0 - $p1 11// SIZE: 22 / 14 * 8 bytes 12// 13gk110_div_u32: 14 sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28 15 bfind u32 $r2 $r1 16 xor b32 $r2 $r2 0x1f 17 mov b32 $r3 0x1 18 shl b32 $r2 $r3 clamp $r2 19 cvt u32 $r1 neg u32 $r1 20 mul $r3 u32 $r1 u32 $r2 21 add $r2 (mul high u32 $r2 u32 $r3) $r2 22 sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 23 mul $r3 u32 $r1 u32 $r2 24 add $r2 (mul high u32 $r2 u32 $r3) $r2 25 mul $r3 u32 $r1 u32 $r2 26 add $r2 (mul high u32 $r2 u32 $r3) $r2 27 mul $r3 u32 $r1 u32 $r2 28 add $r2 (mul high u32 $r2 u32 $r3) $r2 29 mul $r3 u32 $r1 u32 $r2 30 sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04 31 add $r2 (mul high u32 $r2 u32 $r3) $r2 32 mov b32 $r3 $r0 33 mul high $r0 u32 $r0 u32 $r2 34 cvt u32 $r2 neg u32 $r1 35 add $r1 (mul u32 $r1 u32 $r0) $r3 36 set $p0 0x1 ge u32 $r1 $r2 37 $p0 sub b32 $r1 $r1 $r2 38 sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20 39 $p0 add b32 $r0 $r0 0x1 40 $p0 set $p0 0x1 ge u32 $r1 $r2 41 $p0 sub b32 $r1 $r1 $r2 42 $p0 add b32 $r0 $r0 0x1 43 ret 44 45// DIV S32, like DIV U32 after taking ABS(inputs) 46// 47// INPUT: $r0: dividend, $r1: divisor 48// OUTPUT: $r0: result, $r1: modulus 49// CLOBBER: $r2 - $r3, $p0 - $p3 50// 51gk110_div_s32: 52 set $p2 0x1 lt s32 $r0 0x0 53 set $p3 0x1 lt s32 $r1 0x0 xor $p2 54 sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28 55 cvt s32 $r0 abs s32 $r0 56 cvt s32 $r1 abs s32 $r1 57 bfind u32 $r2 $r1 58 xor b32 $r2 $r2 0x1f 59 mov b32 $r3 0x1 60 shl b32 $r2 $r3 clamp $r2 61 cvt u32 $r1 neg u32 $r1 62 sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 63 mul $r3 u32 $r1 u32 $r2 64 add $r2 (mul high u32 $r2 u32 $r3) $r2 65 mul $r3 u32 $r1 u32 $r2 66 add $r2 (mul high u32 $r2 u32 $r3) $r2 67 mul $r3 u32 $r1 u32 $r2 68 add $r2 (mul high u32 $r2 u32 $r3) $r2 69 mul $r3 u32 $r1 u32 $r2 70 sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28 71 add $r2 (mul high u32 $r2 u32 $r3) $r2 72 mul $r3 u32 $r1 u32 $r2 73 add $r2 (mul high u32 $r2 u32 $r3) $r2 74 mov b32 $r3 $r0 75 mul high $r0 u32 $r0 u32 $r2 76 cvt u32 $r2 neg u32 $r1 77 add $r1 (mul u32 $r1 u32 $r0) $r3 78 sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20 79 set $p0 0x1 ge u32 $r1 $r2 80 $p0 sub b32 $r1 $r1 $r2 81 $p0 add b32 $r0 $r0 0x1 82 $p0 set $p0 0x1 ge u32 $r1 $r2 83 $p0 sub b32 $r1 $r1 $r2 84 $p0 add b32 $r0 $r0 0x1 85 $p3 cvt s32 $r0 neg s32 $r0 86 sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28 87 $p2 cvt s32 $r1 neg s32 $r1 88 ret 89 90// RCP F64 91// 92// INPUT: $r0d 93// OUTPUT: $r0d 94// CLOBBER: $r2 - $r9, $p0 95// 96// The core of RCP and RSQ implementation is Newton-Raphson step, which is 97// used to find successively better approximation from an imprecise initial 98// value (single precision rcp in RCP and rsqrt64h in RSQ). 99// 100gk110_rcp_f64: 101 // Step 1: classify input according to exponent and value, and calculate 102 // result for 0/inf/nan. $r2 holds the exponent value, which starts at 103 // bit 52 (bit 20 of the upper half) and is 11 bits in length 104 ext u32 $r2 $r1 0xb14 105 add b32 $r3 $r2 0xffffffff 106 joinat #rcp_rejoin 107 // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, 108 // denorm, or 0). Do this by substracting 1 from the exponent, which will 109 // mean that it's > 0x7fd in those cases when doing unsigned comparison 110 set b32 $p0 0x1 gt u32 $r3 0x7fd 111 // $r3: 0 for norms, 0x36 for denorms, -1 for others 112 mov b32 $r3 0x0 113 sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 114 join (not $p0) nop 115 // Process all special values: NaN, inf, denorm, 0 116 mov b32 $r3 0xffffffff 117 // A number is NaN if its abs value is greater than or unordered with inf 118 set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 119 (not $p0) bra #rcp_inf_or_denorm_or_zero 120 // NaN -> NaN, the next line sets the "quiet" bit of the result. This 121 // behavior is both seen on the CPU and the blob 122 join or b32 $r1 $r1 0x80000 123rcp_inf_or_denorm_or_zero: 124 and b32 $r4 $r1 0x7ff00000 125 // Other values with nonzero in exponent field should be inf 126 set b32 $p0 0x1 eq s32 $r4 0x0 127 sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 128 $p0 bra #rcp_denorm_or_zero 129 // +/-Inf -> +/-0 130 xor b32 $r1 $r1 0x7ff00000 131 join mov b32 $r0 0x0 132rcp_denorm_or_zero: 133 set $p0 0x1 gtu f64 abs $r0d 0x0 134 $p0 bra #rcp_denorm 135 // +/-0 -> +/-Inf 136 join or b32 $r1 $r1 0x7ff00000 137rcp_denorm: 138 // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms 139 mul rn f64 $r0d $r0d 0x4350000000000000 140 sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 141 join mov b32 $r3 0x36 142rcp_rejoin: 143 // All numbers with -1 in $r3 have their result ready in $r0d, return them 144 // others need further calculation 145 set b32 $p0 0x1 lt s32 $r3 0x0 146 $p0 bra #rcp_end 147 // Step 2: Before the real calculation goes on, renormalize the values to 148 // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) 149 // result in $r6d. The exponent will be recovered later. 150 ext u32 $r2 $r1 0xb14 151 and b32 $r7 $r1 0x800fffff 152 add b32 $r7 $r7 0x3ff00000 153 mov b32 $r6 $r0 154 sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e 155 // Step 3: Convert new value to float (no overflow will occur due to step 156 // 2), calculate rcp and do newton-raphson step once 157 cvt rz f32 $r5 f64 $r6d 158 rcp f32 $r4 $r5 159 mov b32 $r0 0xbf800000 160 fma rn f32 $r5 $r4 $r5 $r0 161 fma rn f32 $r0 neg $r4 $r5 $r4 162 // Step 4: convert result $r0 back to double, do newton-raphson steps 163 cvt f64 $r0d f32 $r0 164 cvt f64 $r6d f64 neg $r6d 165 sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 166 cvt f64 $r8d f32 0x3f800000 167 // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d 168 // The formula used here (and above) is: 169 // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} 170 // The following code uses 2 FMAs for each step, and it will basically 171 // looks like: 172 // tmp = -src * RCP_{n} + 1 173 // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} 174 fma rn f64 $r4d $r6d $r0d $r8d 175 fma rn f64 $r0d $r0d $r4d $r0d 176 fma rn f64 $r4d $r6d $r0d $r8d 177 fma rn f64 $r0d $r0d $r4d $r0d 178 fma rn f64 $r4d $r6d $r0d $r8d 179 fma rn f64 $r0d $r0d $r4d $r0d 180 sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 181 fma rn f64 $r4d $r6d $r0d $r8d 182 fma rn f64 $r0d $r0d $r4d $r0d 183 // Step 5: Exponent recovery and final processing 184 // The exponent is recovered by adding what we added to the exponent. 185 // Suppose we want to calculate rcp(x), but we have rcp(cx), then 186 // rcp(x) = c * rcp(cx) 187 // The delta in exponent comes from two sources: 188 // 1) The renormalization in step 2. The delta is: 189 // 0x3ff - $r2 190 // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored 191 // in $r3 192 // These 2 sources are calculated in the first two lines below, and then 193 // added to the exponent extracted from the result above. 194 // Note that after processing, the new exponent may >= 0x7ff (inf) 195 // or <= 0 (denorm). Those cases will be handled respectively below 196 subr b32 $r2 $r2 0x3ff 197 add b32 $r4 $r2 $r3 198 ext u32 $r3 $r1 0xb14 199 // New exponent in $r3 200 add b32 $r3 $r3 $r4 201 add b32 $r2 $r3 0xffffffff 202 sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b 203 // (exponent-1) < 0x7fe (unsigned) means the result is in norm range 204 // (same logic as in step 1) 205 set b32 $p0 0x1 lt u32 $r2 0x7fe 206 (not $p0) bra #rcp_result_inf_or_denorm 207 // Norms: convert exponents back and return 208 shl b32 $r4 $r4 clamp 0x14 209 add b32 $r1 $r4 $r1 210 bra #rcp_end 211rcp_result_inf_or_denorm: 212 // New exponent >= 0x7ff means that result is inf 213 set b32 $p0 0x1 ge s32 $r3 0x7ff 214 (not $p0) bra #rcp_result_denorm 215 sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f 216 // Infinity 217 and b32 $r1 $r1 0x80000000 218 mov b32 $r0 0x0 219 add b32 $r1 $r1 0x7ff00000 220 bra #rcp_end 221rcp_result_denorm: 222 // Denorm result comes from huge input. The greatest possible fp64, i.e. 223 // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest 224 // normal value. Other rcp result should be greater than that. If we 225 // set the exponent field to 1, we can recover the result by multiplying 226 // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise 227 // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies 228 // the logic here. 229 set b32 $p0 0x1 ne u32 $r3 0x0 230 and b32 $r1 $r1 0x800fffff 231 // 0x3e800000: 1/4 232 $p0 cvt f64 $r6d f32 0x3e800000 233 sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 234 // 0x3f000000: 1/2 235 (not $p0) cvt f64 $r6d f32 0x3f000000 236 add b32 $r1 $r1 0x00100000 237 mul rn f64 $r0d $r0d $r6d 238rcp_end: 239 ret 240 241// RSQ F64 242// 243// INPUT: $r0d 244// OUTPUT: $r0d 245// CLOBBER: $r2 - $r9, $p0 - $p1 246// 247gk110_rsq_f64: 248 // Before getting initial result rsqrt64h, two special cases should be 249 // handled first. 250 // 1. NaN: set the highest bit in mantissa so it'll be surely recognized 251 // as NaN in rsqrt64h 252 set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 253 $p0 or b32 $r1 $r1 0x00080000 254 and b32 $r2 $r1 0x7fffffff 255 sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 256 // 2. denorms and small normal values: using their original value will 257 // lose precision either at rsqrt64h or the first step in newton-raphson 258 // steps below. Take 2 as a threshold in exponent field, and multiply 259 // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 260 // to recover in the end) 261 ext u32 $r3 $r1 0xb14 262 set b32 $p1 0x1 le u32 $r3 0x2 263 or b32 $r2 $r0 $r2 264 $p1 mul rn f64 $r0d $r0d 0x4350000000000000 265 rsqrt64h f32 $r5 $r1 266 // rsqrt64h will give correct result for 0/inf/nan, the following logic 267 // checks whether the input is one of those (exponent is 0x7ff or all 0 268 // except for the sign bit) 269 set b32 $r6 ne u32 $r3 0x7ff 270 and b32 $r2 $r2 $r6 271 sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 272 set b32 $p0 0x1 ne u32 $r2 0x0 273 $p0 bra #rsq_norm 274 // For 0/inf/nan, make sure the sign bit agrees with input and return 275 and b32 $r1 $r1 0x80000000 276 mov b32 $r0 0x0 277 or b32 $r1 $r1 $r5 278 ret 279rsq_norm: 280 // For others, do 4 Newton-Raphson steps with the formula: 281 // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) 282 // In the code below, each step is written as: 283 // tmp1 = 0.5 * x * RSQ_{n} 284 // tmp2 = -RSQ_{n} * tmp1 + 0.5 285 // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} 286 mov b32 $r4 0x0 287 sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 288 // 0x3f000000: 1/2 289 cvt f64 $r8d f32 0x3f000000 290 mul rn f64 $r2d $r0d $r8d 291 mul rn f64 $r0d $r2d $r4d 292 fma rn f64 $r6d neg $r4d $r0d $r8d 293 fma rn f64 $r4d $r4d $r6d $r4d 294 mul rn f64 $r0d $r2d $r4d 295 fma rn f64 $r6d neg $r4d $r0d $r8d 296 sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 297 fma rn f64 $r4d $r4d $r6d $r4d 298 mul rn f64 $r0d $r2d $r4d 299 fma rn f64 $r6d neg $r4d $r0d $r8d 300 fma rn f64 $r4d $r4d $r6d $r4d 301 mul rn f64 $r0d $r2d $r4d 302 fma rn f64 $r6d neg $r4d $r0d $r8d 303 fma rn f64 $r4d $r4d $r6d $r4d 304 sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 305 // Multiply 2^27 to result for small inputs to recover 306 $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 307 mov b32 $r1 $r5 308 mov b32 $r0 $r4 309 ret 310 311.section #gk110_builtin_offsets 312.b64 #gk110_div_u32 313.b64 #gk110_div_s32 314.b64 #gk110_rcp_f64 315.b64 #gk110_rsq_f64 316