//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Double Precision Multiply
//
// __hexagon_muldf3: soft-float IEEE-754 double-precision multiply.
// Inputs are read from the register pairs r1:0 (A) and r3:2 (B); the
// result is produced in r1:0 (A).  Exception flags are accumulated in
// the USR register.  Also published under the aliases __qdsp_muldf3,
// __hexagon_fast_muldf3 and __hexagon_fast2_muldf3 (see the *_ALIAS
// macros below).
//
// Naming convention: X names a 64-bit register pair rN+1:rN, and XH / XL
// name its high / low 32-bit halves.
//
// NOTE(review): throughout this file, { ... } groups instructions into one
// VLIW packet; all reads in a packet observe pre-packet register values,
// and two compares writing the same predicate in one packet AND together.
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2

#define BTMP r5:4
#define BTMPH r5
#define BTMPL r4

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6

#define ONE r9:8
#define S_ONE r8
#define S_ZERO r9

#define PP_HH r11:10
#define PP_HH_H r11
#define PP_HH_L r10

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define PP_LL r15:14
#define PP_LL_H r15
#define PP_LL_L r14

#define TMP r28

// IEEE-754 binary64 layout constants.
#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1024
#define MANTISSA_TO_INT_BIAS 52

// Some constant to adjust normalization amount in error code
// Amount to right shift the partial product to get to a denorm
#define FUDGE 5

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

// Bit offset of the 2-bit rounding-mode field in USR.
#define SR_ROUND_OFF 22
	.text
	.global __hexagon_muldf3
	.type __hexagon_muldf3,@function
	Q6_ALIAS(muldf3)
	FAST_ALIAS(muldf3)
	FAST2_ALIAS(muldf3)
	.p2align 5
__hexagon_muldf3:
	// Fast path setup: p0 is true only if BOTH A and B are normal
	// (dfclass mask #2); the two dfclass tests AND into p0.
	// ATMP is seeded with bit 62 set, the slot for the implicit leading 1.
	{
		p0 = dfclass(A,#2)
		p0 = dfclass(B,#2)
		ATMP = combine(##0x40000000,#0)
	}
	// Build fixed-point significands: place each 52-bit mantissa so that
	// its implicit leading 1 lands at bit 62 (ATMP via insert over the
	// seeded bit, BTMP via shift-left then explicit insert of ONE below).
	{
		ATMP = insert(A,#MANTBITS,#EXPBITS-1)
		BTMP = asl(B,#EXPBITS-1)
		TMP = #-BIAS
		ONE = #1
	}
	{
		PP_ODD = mpyu(BTMPL,ATMPH)
		BTMP = insert(ONE,#2,#62)	// set BTMP's implicit-1 at bit 62
	}
	// since we know that the MSB of the H registers is zero, we should never carry
	// H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1
	// Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
	// Therefore, we can add 3 2^32-1 values safely without carry.  We only need one.
	//
	// 64x64 -> 128 significand product via 32x32 partial products:
	// PP_LL = low x low, PP_ODD = the two cross terms, PP_HH = high x high.
	{
		PP_LL = mpyu(ATMPL,BTMPL)
		PP_ODD += mpyu(ATMPL,BTMPH)
	}
	{
		PP_ODD += lsr(PP_LL,#32)
		PP_HH = mpyu(ATMPH,BTMPH)
		BTMP = combine(##BIAS+BIAS-4,#0)	// exponent range bounds for the in-range test below
	}
	// Fold cross terms into the high product; p1 records whether the
	// discarded low 64 bits are all zero (for the sticky bit).
	{
		PP_HH += lsr(PP_ODD,#32)
		if (!p0) jump .Lmul_abnormal
		p1 = cmp.eq(PP_LL_L,#0)		// 64 lsb's 0?
		p1 = cmp.eq(PP_ODD_L,#0)	// 64 lsb's 0?
	}

	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so

// PP_ODD's registers are dead now; reuse r7:6 for the exponent math.
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#define EXP10 r7:6
#define EXP1 r7
#define EXP0 r6
	// OR the sticky bit into PP_HH's LSB; pull both biased exponents.
	{
		if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
		EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
		EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
	}
	// EXP0 = exp(A) + exp(B) - BIAS (TMP holds -BIAS from above);
	// TMP's sign bit becomes the result sign (xor of input signs).
	{
		PP_LL = neg(PP_HH)
		EXP0 += add(TMP,EXP1)
		TMP = xor(AH,BH)
	}
	// If the result sign is negative (p2 false), negate the product so
	// the signed d2df conversion below produces the right sign.
	// p0 tests that EXP0 is within (BTMPL, BTMPH] = the range that cannot
	// overflow or underflow; otherwise take the slow path.
	{
		if (!p2.new) PP_HH = PP_LL
		p2 = cmp.gt(TMP,#-1)
		p0 = !cmp.gt(EXP0,BTMPH)
		p0 = cmp.gt(EXP0,BTMPL)
		if (!p0.new) jump:nt .Lmul_ovf_unf
	}
	// In-range result: convert the 64-bit fixed-point product to double
	// (rounds per USR rounding mode), then patch in the exponent.
	// The #-BIAS-58 compensates for the fixed-point scaling and the bias
	// already present in the converted value.
	{
		A = convert_d2df(PP_HH)
		EXP0 = add(EXP0,#-BIAS-58)
	}
	{
		AH += asl(EXP0,#HI_MANTBITS)
		jumpr r31
	}

	.falign
.Lpossible_unf:
	// We end up with a positive exponent
	// But we may have rounded up to an exponent of 1.
	// If the exponent is 1, if we rounded up to it
	// we need to also raise underflow
	// Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
	// And the PP should also have more than one bit set
	//
	// Note: ATMP should have abs(PP_HH)
	// Note: BTMPL should have 0x7FEFFFFF
	{
		p0 = cmp.eq(AL,#0)
		p0 = bitsclr(AH,BTMPL)		// result is exactly +/-0x0010_0000_0000_0000?
		if (!p0.new) jumpr:t r31	// no -> result already correct, return
		BTMPH = #0x7fff
	}
	// The rounded-up-to-minimum-normal case: if the pre-round product had
	// bits below the top (inexact), raise underflow+inexact (0x030) in USR.
	{
		p0 = bitsset(ATMPH,BTMPH)
		BTMPL = USR
		BTMPH = #0x030			// underflow | inexact flag bits
	}
	{
		if (p0) BTMPL = or(BTMPL,BTMPH)
	}
	{
		USR = BTMPL
	}
	{
		p0 = dfcmp.eq(A,A)		// realize FP exception if enabled
		jumpr r31
	}
	.falign
.Lmul_ovf_unf:
	// Exponent out of the guaranteed-safe range: redo the conversion and
	// classify as true overflow, possible underflow, or actually fine.
	{
		A = convert_d2df(PP_HH)
		ATMP = abs(PP_HH)			// take absolute value
		EXP1 = add(EXP0,#-BIAS-58)
	}
	{
		AH += asl(EXP1,#HI_MANTBITS)
		EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
		BTMPL = ##0x7FEFFFFF		// high word of max finite double
	}
	{
		EXP1 += add(EXP0,##-BIAS-58)	// EXP1 = true (unclamped) result exponent
		//BTMPH = add(clb(ATMP),#-2)
		BTMPH = #0
	}
	{
		p0 = cmp.gt(EXP1,##BIAS+BIAS-2)	// overflow
		if (p0.new) jump:nt .Lmul_ovf
	}
	{
		p0 = cmp.gt(EXP1,#0)
		if (p0.new) jump:nt .Lpossible_unf
		BTMPH = sub(EXP0,BTMPH)
		TMP = #63			// max amount to shift
	}
	// Underflow
	//
	// PP_HH has the partial product with sticky LSB.
	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
	// The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
	// That's the exponent that happens after the normalization
	//
	// EXP0 has the exponent that, when added to the normalized value, is out of range.
	//
	// Strategy:
	//
	// * Shift down bits, with sticky bit, such that the bits are aligned according
	//   to the LZ count and appropriate exponent, but not all the way to mantissa
	//   field, keep around the last few bits.
	// * Put a 1 near the MSB
	// * Check the LSBs for inexact; if inexact also set underflow
	// * Convert [u]d2df -- will correctly round according to rounding mode
	// * Replace exponent field with zero

	{
		BTMPL = #0			// offset for extract
		BTMPH = sub(#FUDGE,BTMPH)	// amount to right shift
	}
	{
		p3 = cmp.gt(PP_HH_H,#-1)	// is it positive?
		BTMPH = min(BTMPH,TMP)		// Don't shift more than 63
		PP_HH = ATMP
	}
	{
		TMP = USR
		PP_LL = extractu(PP_HH,BTMP)	// grab the bits about to be shifted out
	}
	{
		PP_HH = asr(PP_HH,BTMPH)
		BTMPL = #0x0030			// underflow flag
		AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)	// clear result exponent field
	}
	{
		p0 = cmp.gtu(ONE,PP_LL)		// Did we extract all zeros?
		if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)	// add sticky bit
		PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)	// Add back in a bit so we can use convert instruction
	}
	{
		PP_LL = neg(PP_HH)
		p1 = bitsclr(PP_HH_L,#0x7)	// Are the LSB's clear?
		if (!p1.new) TMP = or(BTMPL,TMP)	// If not, Inexact+Underflow
	}
	{
		if (!p3) PP_HH = PP_LL		// restore the sign before conversion
		USR = TMP
	}
	{
		A = convert_d2df(PP_HH)		// Do rounding
		p0 = dfcmp.eq(A,A)		// realize exception
	}
	{
		AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)	// Insert correct exponent
		jumpr r31
	}
	.falign
.Lmul_ovf:
	// We get either max finite value or infinity.  Either way, overflow+inexact
	{
		TMP = USR
		ATMP = combine(##0x7fefffff,#-1)	// positive max finite
		A = PP_HH
	}
	{
		PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)	// rounding bits
		TMP = or(TMP,#0x28)		// inexact + overflow
		BTMP = combine(##0x7ff00000,#0)	// positive infinity
	}
	{
		USR = TMP
		PP_LL_L ^= lsr(AH,#31)		// Does sign match rounding?
		TMP = PP_LL_L			// unmodified rounding mode
	}
	// Round-to-zero, or rounding toward the opposite infinity, clamps to
	// max finite; every other mode produces infinity.
	{
		p0 = !cmp.eq(TMP,#1)		// If not round-to-zero and
		p0 = !cmp.eq(PP_LL_L,#2)	// Not rounding the other way,
		if (p0.new) ATMP = BTMP		// we should get infinity
		p0 = dfcmp.eq(A,A)		// Realize FP exception if enabled
	}
	{
		A = insert(ATMP,#63,#0)		// insert inf/maxfinite, leave sign
		jumpr r31
	}

.Lmul_abnormal:
	// Slow path: at least one operand is zero, subnormal, infinity or NaN.
	{
		ATMP = extractu(A,#63,#0)	// strip off sign
		BTMP = extractu(B,#63,#0)	// strip off sign
	}
	// Sort so that |A| >= |B| (packet reads use pre-packet values, so the
	// two conditional moves form a swap).
	{
		p3 = cmp.gtu(ATMP,BTMP)
		if (!p3.new) A = B		// sort values
		if (!p3.new) B = A		// sort values
	}
	{
		// Any NaN --> NaN, possibly raise invalid if sNaN
		p0 = dfclass(A,#0x0f)		// A not NaN?
		if (!p0.new) jump:nt .Linvalid_nan
		if (!p3) ATMP = BTMP		// keep the magnitude copies sorted too
		if (!p3) BTMP = ATMP
	}
	{
		// Infinity * nonzero number is infinity
		p1 = dfclass(A,#0x08)		// A is infinity
		p1 = dfclass(B,#0x0e)		// B is nonzero
	}
	{
		// Infinity * zero --> NaN, raise invalid
		// Other zeros return zero
		p0 = dfclass(A,#0x08)		// A is infinity
		p0 = dfclass(B,#0x01)		// B is zero
	}
	{
		if (p1) jump .Ltrue_inf
		p2 = dfclass(B,#0x01)
	}
	{
		if (p0) jump .Linvalid_zeroinf
		if (p2) jump .Ltrue_zero	// so return zero
		TMP = ##0x7c000000		// mask to detect a very small exponent in A
	}
	// We are left with a normal or subnormal times a subnormal. A > B
	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
	// we go to a single sticky bit, which we can round easily.
	// If A and B might multiply to something bigger, decrease A exponent and increase
	// B exponent and try again
	{
		p0 = bitsclr(AH,TMP)
		if (p0.new) jump:nt .Lmul_tiny
	}
	// Normalize B: shift its subnormal significand up by the leading-zero
	// count (less the exponent field width) and move that scaling into A's
	// exponent, then retry the whole multiply with normalized operands.
	{
		TMP = cl0(BTMP)
	}
	{
		TMP = add(TMP,#-EXPBITS)
	}
	{
		BTMP = asl(BTMP,TMP)
	}
	{
		B = insert(BTMP,#63,#0)
		AH -= asl(TMP,#HI_MANTBITS)
	}
	jump __hexagon_muldf3
.Lmul_tiny:
	// Product is far below the smallest subnormal: result is +/-0 or the
	// smallest subnormal, chosen by the rounding direction; always raise
	// inexact + underflow.
	{
		TMP = USR
		A = xor(A,B)			// get sign bit
	}
	{
		TMP = or(TMP,#0x30)		// Inexact + Underflow
		A = insert(ONE,#63,#0)		// put in rounded up value
		BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)	// get rounding mode
	}
	{
		USR = TMP
		p0 = cmp.gt(BTMPH,#1)		// Round towards pos/neg inf?
		if (!p0.new) AL = #0		// If not, zero
		BTMPH ^= lsr(AH,#31)		// rounding my way --> set LSB
	}
	{
		p0 = cmp.eq(BTMPH,#3)		// if rounding towards right inf
		if (!p0.new) AL = #0		// don't go to zero
		jumpr r31
	}
.Linvalid_zeroinf:
	// 0 * infinity: return all-ones NaN pattern and raise invalid (USR bit 1).
	{
		TMP = USR
	}
	{
		A = #-1
		TMP = or(TMP,#2)		// invalid flag
	}
	{
		USR = TMP
	}
	{
		p0 = dfcmp.uo(A,A)		// force exception if enabled
		jumpr r31
	}
.Linvalid_nan:
	// NaN operand(s): return all-ones NaN; the df2sf conversions raise
	// invalid for signaling NaN inputs.
	{
		p0 = dfclass(B,#0x0f)		// if B is not NaN
		TMP = convert_df2sf(A)		// will generate invalid if sNaN
		if (p0.new) B = A		// make it whatever A is
	}
	{
		BL = convert_df2sf(B)		// will generate invalid if sNaN
		A = #-1
		jumpr r31
	}
	.falign
.Ltrue_zero:
	// Swap so the zero is in A and the signed operand is in B (packet
	// semantics: both reads use pre-packet values), then fall through to
	// fold B's sign into the result.
	{
		A = B
		B = A
	}
.Ltrue_inf:
	// Return A (infinity, or the zero placed above) with the sign XORed
	// from B's sign bit.
	{
		BH = extract(BH,#1,#31)
	}
	{
		AH ^= asl(BH,#31)
		jumpr r31
	}
END(__hexagon_muldf3)

#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH