//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

// Double Precision Fused Multiply-Add

#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4

#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10

#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6

#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16

#define EXPA r18
#define EXPB r19
#define EXPBA r19:18

#define TMP r28

#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1

#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32

#define ADJUST 4

#define FUDGE 7
#define FUDGE2 3

#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif

  // First, classify for normal values, and bail out to the slow paths if abnormal
  //
  // Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
  //
  // Since we know that the 2 MSBs of the H registers are zero, we can never carry
  // out of the partial products that involve the H registers
  //
  // Try to buy X slots, at the expense of latency if needed
  //
  // We will have PP_HH with the upper bits of the product, PP_LL with the lower
  // PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
  // PP_HH can have a minimum of 0x0100_0000_0000_0000
  //
  // 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
  //
  // We need to align CTMP.
  // If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
  // If CTMP << PP, align CTMP and add 128 bits. Then compute sticky.
  // If CTMP ~= PP, align CTMP and add 128 bits. May have massive cancellation.
  //
  // Convert partial product and CTMP to 2's complement prior to addition
  //
  // After we add, we need to normalize into upper 64 bits, then compute sticky.
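
  // The unpacking step is easier to see in C. The sketch below is
  // illustrative only (not part of the build); it assumes IEEE-754 binary64
  // layout and shows how a normal operand's 52-bit mantissa is repositioned
  // so the implicit bit lands at bit 60 (0x1000_0000_0000_0000), leaving the
  // top two bits clear for carries:
  //
  //   #include <stdint.h>
  //   #include <string.h>
  //
  //   static uint64_t unpack_mant(double x) {
  //     uint64_t bits;
  //     memcpy(&bits, &x, sizeof bits);               // raw binary64 bits
  //     uint64_t mant = bits & 0x000FFFFFFFFFFFFFULL; // low 52 bits
  //     return 0x1000000000000000ULL + (mant << 8);   // implicit bit at 2^60
  //   }
  //
  // This matches the insert(...,#MANTBITS,#EXPBITS-3) packets below:
  // EXPBITS-3 == 8, so the mantissa is shifted left by 8, and OR'ing
  // ##0x10000000 into the high word supplies the implicit bit.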
  .text
  .global __hexagon_fmadf4
  .type __hexagon_fmadf4,@function
  .global __hexagon_fmadf5
  .type __hexagon_fmadf5,@function
  Q6_ALIAS(fmadf5)
  .p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
.Lfma_begin:
  {
    P_TMP = dfclass(A,#2)
    P_TMP = dfclass(B,#2)
    ATMP = #0
    BTMP = #0
  }
  {
    ATMP = insert(A,#MANTBITS,#EXPBITS-3)
    BTMP = insert(B,#MANTBITS,#EXPBITS-3)
    PP_ODD_H = ##0x10000000
    allocframe(#STACKSPACE)
  }
  {
    PP_LL = mpyu(ATMPL,BTMPL)
    if (!P_TMP) jump .Lfma_abnormal_ab
    ATMPH = or(ATMPH,PP_ODD_H)
    BTMPH = or(BTMPH,PP_ODD_H)
  }
  {
    P_TMP = dfclass(C,#2)
    if (!P_TMP.new) jump:nt .Lfma_abnormal_c
    CTMP = combine(PP_ODD_H,#0)
    PP_ODD = combine(#0,PP_LL_H)
  }
.Lfma_abnormal_c_restart:
  {
    PP_ODD += mpyu(BTMPL,ATMPH)
    CTMP = insert(C,#MANTBITS,#EXPBITS-3)
    memd(r29+#0) = PP_HH
    memd(r29+#8) = EXPBA
  }
  {
    PP_ODD += mpyu(ATMPL,BTMPH)
    EXPBA = neg(CTMP)
    P_TMP = cmp.gt(CH,#-1)
    TMP = xor(AH,BH)
  }
  {
    EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
    EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
    PP_HH = combine(#0,PP_ODD_H)
    if (!P_TMP) CTMP = EXPBA
  }
  {
    PP_HH += mpyu(ATMPH,BTMPH)
    PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12

    EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
    EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
  }
  // PP_HH:PP_LL now has product
  // CTMP is negated
  // EXPA,B,C are extracted
  // We need to negate PP
  // Since we will be adding with carry later, if we need to negate,
  // just invert all bits now, which we can do conditionally and in parallel
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
  {
    EXPA = add(EXPA,#-BIAS+(ADJUST))
    PROD_NEG = !cmp.gt(TMP,#-1)
    PP_LL_TMP = #0
    PP_HH_TMP = #0
  }
  {
    PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
    P_TMP = !cmp.gt(TMP,#-1)
    SWAP = cmp.gt(EXPC,EXPA)                  // If C >> PP
    if (SWAP.new) EXPCA = combine(EXPA,EXPC)
  }
  {
    PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
    if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
    CTMP2 = #0
    EXPC = sub(EXPA,EXPC)
  }
  {
    if (P_TMP) PP_HH = PP_HH_TMP
    P_TMP = cmp.gt(EXPC,#63)
    if (SWAP) PP_LL = CTMP2
    if (SWAP) CTMP2 = PP_LL
  }
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
  {
    if (SWAP) PP_HH = CTMP                    // Swap C and PP
    if (SWAP) CTMP = PP_HH
    if (P_TMP) EXPC = add(EXPC,#-64)
    TMP = #63
  }
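  // The conditional negation above, sketched in C (illustrative only, not
  // part of the build). When the product's sign is negative (PROD_NEG), the
  // 128-bit partial product is negated by subtracting it from zero with a
  // borrow chain; both halves are computed speculatively and committed only
  // if the predicate is set:
  //
  //   #include <stdint.h>
  //
  //   // Negate a 128-bit value held as two 64-bit halves when neg is set.
  //   static void cond_neg128(uint64_t *hi, uint64_t *lo, int neg) {
  //     if (neg) {
  //       uint64_t borrow = (*lo != 0);  // borrow out of the low half
  //       *lo = 0 - *lo;
  //       *hi = 0 - *hi - borrow;
  //     }
  //   }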
  {
    // If diff > 63, pre-shift-right by 64...
    if (P_TMP) CTMP2 = CTMP
    TMP = asr(CTMPH,#31)
    RIGHTSHIFT = min(EXPC,TMP)
    LEFTSHIFT = #0
  }
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
  {
    if (P_TMP) CTMP = combine(TMP,TMP)        // sign extension of pre-shift-right-64
    STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
    CTMP2 = lsr(CTMP2,RIGHTSHIFT)
    LEFTSHIFT = sub(#64,RIGHTSHIFT)
  }
  {
    ZERO = #0
    TMP = #-2
    CTMP2 |= lsl(CTMP,LEFTSHIFT)
    CTMP = asr(CTMP,RIGHTSHIFT)
  }
  {
    P_CARRY = cmp.gtu(STICKIES,ZERO)          // If we have sticky bits from C shift
    if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
    ONE = #1
    STICKIES = #0
  }
  {
    PP_LL = add(CTMP2,PP_LL,P_CARRY):carry    // use the carry to add the sticky
  }
  {
    PP_HH = add(CTMP,PP_HH,P_CARRY):carry
    TMP = #62
  }
  // PP_HH:PP_LL now holds the sum
  // We may need to normalize left, by up to 62 bits per pass.
  //
  // Even with massive cancellation, the amount we normalize by is still
  // limited: if PP_HH is all sign bits, we pre-shift by 62 and then
  // recompute the shift amount.
  {
    LEFTSHIFT = add(clb(PP_HH),#-2)
    if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f // all sign bits?
  }
  // We had all sign bits, shift left by 62.
  {
    CTMP = extractu(PP_LL,#62,#2)
    PP_LL = asl(PP_LL,#62)
    EXPA = add(EXPA,#-62)                     // And adjust exponent of result
  }
  {
    PP_HH = insert(CTMP,#62,#0)               // then insert the 62 bits saved from PP_LL
  }
  {
    LEFTSHIFT = add(clb(PP_HH),#-2)
  }
  .falign
1:
  {
    CTMP = asl(PP_HH,LEFTSHIFT)
    STICKIES |= asl(PP_LL,LEFTSHIFT)
    RIGHTSHIFT = sub(#64,LEFTSHIFT)
    EXPA = sub(EXPA,LEFTSHIFT)
  }
  {
    CTMP |= lsr(PP_LL,RIGHTSHIFT)
    EXACT = cmp.gtu(ONE,STICKIES)
    TMP = #BIAS+BIAS-2
  }
  {
    if (!EXACT) CTMPL = or(CTMPL,S_ONE)
    // If EXPA is overflow/underflow, jump to ovf_unf
    P_TMP = !cmp.gt(EXPA,TMP)
    P_TMP = cmp.gt(EXPA,#1)
    if (!P_TMP.new) jump:nt .Lfma_ovf_unf
  }
  {
    // XXX: FIXME: should PP_HH for check of zero be CTMP?
    P_TMP = cmp.gtu(ONE,CTMP)                 // is result true zero?
    A = convert_d2df(CTMP)
    EXPA = add(EXPA,#-BIAS-60)
    PP_HH = memd(r29+#0)
  }
  {
    AH += asl(EXPA,#HI_MANTBITS)
    EXPCA = memd(r29+#8)
    if (!P_TMP) dealloc_return                // not zero, return
  }
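
  // The sticky-bit handling above, sketched in C (illustrative only, not
  // part of the build). Whenever a value is shifted right during alignment,
  // the bits shifted out must survive as a single "sticky" LSB so that the
  // final rounding still sees that the result is inexact:
  //
  //   #include <stdint.h>
  //
  //   static uint64_t shr_sticky(uint64_t x, unsigned n) { // 0 < n < 64
  //     uint64_t lost = x & ((1ULL << n) - 1); // bits that fall off
  //     return (x >> n) | (lost != 0);         // fold them into the LSB
  //   }
  //
  // The and(CTMP2L,#-2) step above serves the same purpose: it clears the
  // LSB first so that adding the sticky 1 through the carry behaves like OR.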
.Ladd_yields_zero:
  // We had full cancellation. Return +/- zero (-0 when round-down)
  {
    TMP = USR
    A = #0
  }
  {
    TMP = extractu(TMP,#2,#SR_ROUND_OFF)
    PP_HH = memd(r29+#0)
    EXPCA = memd(r29+#8)
  }
  {
    p0 = cmp.eq(TMP,#2)
    if (p0.new) AH = ##0x80000000
    dealloc_return
  }

#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L

.Lfma_ovf_unf:
  {
    p0 = cmp.gtu(ONE,CTMP)                    // fully cancelled to zero?
    if (p0.new) jump:nt .Ladd_yields_zero
  }
  {
    A = convert_d2df(CTMP)
    EXPA = add(EXPA,#-BIAS-60)
    TMP = EXPA
  }
#define NEW_EXPB r7
#define NEW_EXPA r6
  {
    AH += asl(EXPA,#HI_MANTBITS)
    NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
  }
  {
    NEW_EXPA = add(EXPA,NEW_EXPB)
    PP_HH = memd(r29+#0)
    EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
    ATMP = abs(CTMP)
  }
  {
    p0 = cmp.gt(EXPA,##BIAS+BIAS)
    if (p0.new) jump:nt .Lfma_ovf
  }
  {
    p0 = cmp.gt(EXPA,#0)
    if (p0.new) jump:nt .Lpossible_unf
  }
  {
    // TMP has original EXPA.
    // ATMP is corresponding value
    // Normalize ATMP and shift right to correct location
    EXPB = add(clb(ATMP),#-2)                 // Amount to left shift to normalize
    EXPA = sub(#1+5,TMP)                      // Amount to right shift to denormalize
    p3 = cmp.gt(CTMPH,#-1)
  }
  // Underflow
  // We know that the infinite-range exponent should be EXPA
  // CTMP is 2's complement, ATMP is abs(CTMP)
  {
    EXPA = add(EXPA,EXPB)                     // how much to shift back right
    ATMP = asl(ATMP,EXPB)                     // shift left
    AH = USR
    TMP = #63
  }
  {
    EXPB = min(EXPA,TMP)
    EXPA = #0
    AL = #0x0030
  }
  {
    B = extractu(ATMP,EXPBA)
    ATMP = asr(ATMP,EXPB)
  }
  {
    p0 = cmp.gtu(ONE,B)
    if (!p0.new) ATMPL = or(ATMPL,S_ONE)      // sticky bit from denormalizing shift
    ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
  }
  {
    CTMP = neg(ATMP)
    p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
    if (!p1.new) AH = or(AH,AL)               // raise underflow/inexact
    B = #0
  }
  {
    if (p3) CTMP = ATMP
    USR = AH
    TMP = #-BIAS-(MANTBITS+FUDGE2)
  }
  {
    A = convert_d2df(CTMP)
  }
  {
    AH += asl(TMP,#HI_MANTBITS)
    dealloc_return
  }
.Lpossible_unf:
  // Result landed exactly on the smallest normal; if the low bits show it
  // was rounded up from below, raise underflow/inexact.
  {
    TMP = ##0x7fefffff
    ATMP = abs(CTMP)
  }
  {
    p0 = cmp.eq(AL,#0)
    p0 = bitsclr(AH,TMP)
    if (!p0.new) dealloc_return:t
    TMP = #0x7fff
  }
  {
    p0 = bitsset(ATMPH,TMP)
    BH = USR
    BL = #0x0030
  }
  {
    if (p0) BH = or(BH,BL)
  }
  {
    USR = BH
  }
  {
    p0 = dfcmp.eq(A,A)
    dealloc_return
  }
.Lfma_ovf:
  {
    TMP = USR
    CTMP = combine(##0x7fefffff,#-1)
    A = CTMP
  }
  {
    ATMP = combine(##0x7ff00000,#0)
    BH = extractu(TMP,#2,#SR_ROUND_OFF)
    TMP = or(TMP,#0x28)                       // raise overflow/inexact
  }
  {
    USR = TMP
    BH ^= lsr(AH,#31)
    BL = BH
  }
  {
    p0 = !cmp.eq(BL,#1)
    p0 = !cmp.eq(BH,#2)
  }
  {
    p0 = dfcmp.eq(ATMP,ATMP)
    if (p0.new) CTMP = ATMP
  }
  {
    A = insert(CTMP,#63,#0)
    dealloc_return
  }
#undef CTMP
#undef CTMPH
#undef CTMPL
#define BTMP r11:10
#define BTMPH r11
#define BTMPL r10

#undef STICKIES
#undef STICKIESH
#undef STICKIESL
#define C r5:4
#define CH r5
#define CL r4
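
  // Result selection on overflow (.Lfma_ovf above), sketched in C
  // (illustrative only, not part of the build). Whether overflow returns
  // +/-infinity or the largest finite magnitude depends on the two-bit
  // rounding mode read from USR (0 = nearest, 1 = toward zero, 2 = down,
  // 3 = up) and on the result's sign:
  //
  //   #include <stdint.h>
  //
  //   static uint64_t overflow_bits(unsigned mode, unsigned sign) {
  //     // Largest finite iff rounding toward zero, or rounding away from
  //     // the side the result overflowed on: mode^sign == 2 covers
  //     // (down, positive) and (up, negative).
  //     int max_finite = (mode == 1) || ((mode ^ sign) == 2);
  //     uint64_t mag = max_finite ? 0x7FEFFFFFFFFFFFFFULL  // DBL_MAX
  //                               : 0x7FF0000000000000ULL; // infinity
  //     return ((uint64_t)sign << 63) | mag;
  //   }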
.Lfma_abnormal_ab:
  {
    ATMP = extractu(A,#63,#0)
    BTMP = extractu(B,#63,#0)
    deallocframe
  }
  {
    p3 = cmp.gtu(ATMP,BTMP)
    if (!p3.new) A = B                        // sort values
    if (!p3.new) B = A
  }
  {
    p0 = dfclass(A,#0x0f)                     // A NaN?
    if (!p0.new) jump:nt .Lnan
    if (!p3) ATMP = BTMP
    if (!p3) BTMP = ATMP
  }
  {
    p1 = dfclass(A,#0x08)                     // A is infinity
    p1 = dfclass(B,#0x0e)                     // B is nonzero
  }
  {
    p0 = dfclass(A,#0x08)                     // a is inf
    p0 = dfclass(B,#0x01)                     // b is zero
  }
  {
    if (p1) jump .Lab_inf
    p2 = dfclass(B,#0x01)
  }
  {
    if (p0) jump .Linvalid
    if (p2) jump .Lab_true_zero
    TMP = ##0x7c000000
  }
  // We are left with a normal or subnormal times a subnormal, A > B.
  // If A and B are both very small, we will go to a single sticky bit; replace
  // the lower 63 bits of A and B with 0x0010_0000_0000_0000, which yields
  // equivalent results.
  // If A and B might multiply to something bigger, decrease A's exponent and
  // increase B's exponent, then start over.
  {
    p0 = bitsclr(AH,TMP)
    if (p0.new) jump:nt .Lfma_ab_tiny
  }
  {
    TMP = add(clb(BTMP),#-EXPBITS)
  }
  {
    BTMP = asl(BTMP,TMP)
  }
  {
    B = insert(BTMP,#63,#0)
    AH -= asl(TMP,#HI_MANTBITS)
  }
  jump .Lfma_begin

.Lfma_ab_tiny:
  ATMP = combine(##0x00100000,#0)
  {
    A = insert(ATMP,#63,#0)
    B = insert(ATMP,#63,#0)
  }
  jump .Lfma_begin

.Lab_inf:
  {
    B = lsr(B,#63)
    p0 = dfclass(C,#0x10)
  }
  {
    A ^= asl(B,#63)
    if (p0) jump .Lnan
  }
  {
    p1 = dfclass(C,#0x08)
    if (p1.new) jump:nt .Lfma_inf_plus_inf
  }
  // A*B is +/- inf, C is finite. Return A
  {
    jumpr r31
  }
  .falign
.Lfma_inf_plus_inf:
  { // adding infinities of different signs is invalid
    p0 = dfcmp.eq(A,C)
    if (!p0.new) jump:nt .Linvalid
  }
  {
    jumpr r31
  }

.Lnan:
  {
    p0 = dfclass(B,#0x10)
    p1 = dfclass(C,#0x10)
    if (!p0.new) B = A
    if (!p1.new) C = A
  }
  { // find sNaNs
    BH = convert_df2sf(B)
    BL = convert_df2sf(C)
  }
  {
    BH = convert_df2sf(A)
    A = #-1
    jumpr r31
  }

.Linvalid:
  {
    TMP = ##0x7f800001                        // sp snan
  }
  {
    A = convert_sf2df(TMP)
    jumpr r31
  }

.Lab_true_zero:
  // B is zero, A is finite number
  {
    p0 = dfclass(C,#0x10)
    if (p0.new) jump:nt .Lnan
    if (p0.new) A = C
  }
  {
    p0 = dfcmp.eq(B,C)                        // is C also zero?
    AH = lsr(AH,#31)                          // get sign
  }
  {
    BH ^= asl(AH,#31)                         // form correctly signed zero in B
    if (!p0) A = C                            // If C is not zero, return C
    if (!p0) jumpr r31
  }
  // B has correctly signed zero, C is also zero
.Lzero_plus_zero:
  {
    p0 = cmp.eq(B,C)                          // scalar equality: (+0)+(+0) or (-0)+(-0)
    if (p0.new) jumpr:t r31
    A = B
  }
  {
    TMP = USR
  }
  {
    TMP = extractu(TMP,#2,#SR_ROUND_OFF)
    A = #0
  }
  {
    p0 = cmp.eq(TMP,#2)
    if (p0.new) AH = ##0x80000000
    jumpr r31
  }
#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10
  .falign
.Lfma_abnormal_c:
  // We know that AB is normal * normal
  // C is not normal: zero, subnormal, inf, or NaN.
  {
    p0 = dfclass(C,#0x10)                     // is C NaN?
    if (p0.new) jump:nt .Lnan
    if (p0.new) A = C                         // move NaN to A
    deallocframe
  }
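  // The subnormal fixup at the end of this path (see the last packet below)
  // is easier to see numerically. This is an illustrative C sketch (not part
  // of the build): a subnormal has value mant * 2^(1-BIAS-52), i.e. exactly
  // the value its bit pattern would denote with the exponent field forced to
  // 1 and no implicit bit. So the code writes 1 into C's exponent field and
  // restarts; the restart path re-extracts mantissa and exponent and needs
  // no other special case:
  //
  //   #include <stdint.h>
  //
  //   static uint64_t fixup_subnormal_c(uint64_t cbits) {
  //     cbits &= ~(0x7FFULL << 52);  // clear the 11-bit exponent field
  //     cbits |= (1ULL << 52);       // force the exponent field to 1
  //     return cbits;                // mantissa and sign are unchanged
  //   }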
  {
    p0 = dfclass(C,#0x08)                     // is C inf?
    if (p0.new) A = C                         // return C
    if (p0.new) jumpr:nt r31
  }
  // zero or subnormal
  // If we have a zero, and we know AB is normal*normal, we can just call normal multiply
  {
    p0 = dfclass(C,#0x01)                     // is C zero?
    if (p0.new) jump:nt __hexagon_muldf3
    TMP = #1
  }
  // Left with: subnormal
  // Adjust C and jump back to restart
  {
    allocframe(#STACKSPACE)                   // oops, deallocated above, re-allocate frame
    CTMP = #0
    CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
    jump .Lfma_abnormal_c_restart
  }
END(__hexagon_fmadf4)
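
// Calling-convention note (an assumption based on the register defines at
// the top of this file, not stated elsewhere in it): A, B, and C arrive in
// r1:0, r3:2, and r5:4, and the result is returned in r1:0, so both entry
// points behave like the C prototype
//
//   double __hexagon_fmadf4(double a, double b, double c); // returns a*b + c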