#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# POWER7		+260-530%
# POWER8		+220-340%

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my $sp="r1";

{
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_",(3..12,22..31));

my ($acc6,$acc7)=($bp,$bi);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine	"any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
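#
# Added illustration (a hedged sketch, not how the script below does it):
# the expected element count is 37 tables x 64 points x 16 32-bit words
# per P256_POINT_AFFINE, and since an array in numeric context yields its
# element count directly, an equivalent check would be
#
#	my $expected = 37 * 64 * 16;	# tables * points * words per point
#	die "insane number of elements" if (@arr != $expected);
#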
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.type	ecp_nistz256_precomputed,\@object
.globl	ecp_nistz256_precomputed
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}

$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.align	5
ecp_nistz256_mul_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.align	4
ecp_nistz256_sqr_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.align	4
ecp_nistz256_add:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$t0, 0($bp)
	ld	$acc1,8($ap)
	ld	$t1, 8($bp)
	ld	$acc2,16($ap)
	ld	$t2, 16($bp)
	ld	$acc3,24($ap)
	ld	$t3, 24($bp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_add,.-ecp_nistz256_add

# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.align	4
ecp_nistz256_div_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_div_by_2

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.align	4
ecp_nistz256_mul_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.align	4
ecp_nistz256_mul_by_3:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	std	$acc0,64($sp)
	mr	$t1,$acc1
	std	$acc1,72($sp)
	mr	$t2,$acc2
	std	$acc2,80($sp)
	mr	$t3,$acc3
	std	$acc3,88($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	ld	$t0,64($sp)
	ld	$t1,72($sp)
	ld	$t2,80($sp)
	ld	$t3,88($sp)

	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.align	4
ecp_nistz256_sub:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.align	4
ecp_nistz256_neg:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	mr	$bp,$ap
	li	$acc0,0
	li	$acc1,0
	li	$acc2,0
	li	$acc3,0

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,\@function
.align	4
__ecp_nistz256_mul_mont:
	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
	ld	$bi,8($bp)		# b[1]

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	addze	$acc4,$t3
	li	$acc5,0
___
for($i=1;$i<4;$i++) {
	################################################################
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	#          - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	#          - 0000abcd.efgh0000.--------.--------.--------
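	#
	# Added worked note (an interpretation of the code below, not
	# original commentary): with n = acc[0], the pair (t1:t0) =
	# (n>>32, n<<32) is just n<<96, while the two subtractions
	# compute the 128-bit product
	#
	#	(t3:t2) = n*0xffffffff00000001 = n<<64 - n<<32 + n,
	#
	# i.e. n times the top limb of the modulus.  Adding n<<96 at
	# limb 1 and (t3:t2) at limbs 2-3, then dropping the least
	# significant limb (which becomes 0, because the modulus' low
	# limb is all ones and n*0xff..ff ends in -n) is exactly
	# acc = (acc + n*modulus) >> 64.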
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
	addc	$acc0,$acc0,$t0		# accumulate low parts of multiplication
	mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
	addze	$acc4,$acc4
___
$code.=<<___	if ($i<3);
	ld	$bi,8*($i+1)($bp)	# b[$i+1]
___
$code.=<<___;
	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	li	$acc5,0
	addze	$acc5,$acc5
___
}
$code.=<<___;
	# last reduction
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	li	$t2,0
	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
.type	__ecp_nistz256_sqr_mont,\@function
.align	4
__ecp_nistz256_sqr_mont:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below mark carrying into high part of
	#  multiplication result, which can't overflow, because it
	#  can never be all ones.
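	#
	# Added worked bound for the claim above: for 64-bit values
	# a,b <= 2^64-1,
	#
	#	a*b <= (2^64-1)^2 = 2^128 - 2^65 + 1,
	#
	# so the upper limb returned by mulhdu is at most 2^64-2, never
	# all ones, and absorbing a single carry bit cannot wrap.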

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	adde	$acc4,$acc4,$t1
	adde	$acc5,$acc5,$t2
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	adde	$acc2,$acc2,$acc2
	adde	$acc3,$acc3,$acc3
	adde	$acc4,$acc4,$acc4
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	li	$acc7,0
	addze	$acc7,$acc7

	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	mulhdu	$a0,$a0,$a0
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	mulhdu	$a1,$a1,$a1
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	mulhdu	$a2,$a2,$a2
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	mulhdu	$a3,$a3,$a3
	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	sldi	$t0,$acc0,32
	adde	$acc1,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow
___
}
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow

	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	li	$t2,0
	addze	$acc4,$t2

	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $a0-$a3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,\@function
.align	4
__ecp_nistz256_add:
	addc	$acc0,$acc0,$t0		# ret = a+b
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	li	$t2,0
	adde	$acc3,$acc3,$t3
	addze	$t0,$t2

	# if a+b >= modulus, subtract modulus
	#
	# But since comparison implies subtraction, we subtract
	# modulus and then add it back if subtraction borrowed.
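	#
	# Added note: after the borrow chain below, $t0 ends up 0 (no
	# borrow: keep a+b-modulus) or all ones (underflow).  The low
	# limb of the modulus is all ones too, so "addc" of $t0 itself
	# already adds back (modulus & mask) for limb 0, and the masked
	# $poly1/$poly3 limbs handle the rest.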

	subic	$acc0,$acc0,-1
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$t0,$t2,$t0

	addc	$acc0,$acc0,$t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,\@function
.align	4
__ecp_nistz256_sub_from:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$t0,$acc0		# ret = a-b
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if a-b borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,\@function
.align	4
__ecp_nistz256_sub_morf:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$acc0,$t0		# ret = b-a
	subfe	$acc1,$acc1,$t1
	subfe	$acc2,$acc2,$t2
	subfe	$acc3,$acc3,$t3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if b-a borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret += modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,\@function
.align	4
__ecp_nistz256_div_by_2:
	andi.	$t0,$acc0,1
	addic	$acc0,$acc0,-1		# a += modulus
	neg	$t0,$t0
	adde	$acc1,$acc1,$poly1
	not	$t0,$t0
	addze	$acc2,$acc2
	li	$t2,0
	adde	$acc3,$acc3,$poly3
	and	$t1,$poly1,$t0
	addze	$ap,$t2			# ap = carry
	and	$t3,$poly3,$t0

	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$ap, $t2,$ap

	srdi	$acc0,$acc0,1
	sldi	$t0,$acc1,63
	srdi	$acc1,$acc1,1
	sldi	$t1,$acc2,63
	srdi	$acc2,$acc2,1
	sldi	$t2,$acc3,63
	srdi	$acc3,$acc3,1
	sldi	$t3,$ap,63
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void	ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
if (1) {
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
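# In concrete terms (added sketch): the map resolves to byte offsets
# $S=64, $M=96, $Zsqr=128 and $tmp0=160 from $sp, below the twelve
# saved GPRs at $FRAME-8*12 .. $FRAME-8*1 ($FRAME being 288 here).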
my ($rp_real,$ap_real) = map("r$_",(20,21));

$code.=<<___;
.globl	ecp_nistz256_point_double
.align	5
ecp_nistz256_point_double:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
.Ldouble_shortcut:
	ld	$acc0,32($ap)
	ld	$acc1,40($ap)
	ld	$acc2,48($ap)
	ld	$acc3,56($ap)
	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,64($ap)		# forward load for p256_sqr_mont
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_add	# p256_mul_by_2(S, in_y);

	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Zsqr, in_z);

	ld	$t0,0($ap_real)
	ld	$t1,8($ap_real)
	ld	$t2,16($ap_real)
	ld	$t3,24($ap_real)
	mr	$a0,$acc0		# put Zsqr aside for p256_sub
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add	# p256_add(M, Zsqr, in_x);

	addi	$bp,$ap_real,0
	mr	$acc0,$a0		# restore Zsqr
	mr	$acc1,$a1
	mr	$acc2,$a2
	mr	$acc3,$a3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sub_morf	# p256_sub(Zsqr, in_x, Zsqr);

	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(S, S);

	ld	$bi,32($ap_real)
	ld	$a0,64($ap_real)
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(tmp0, in_z, in_y);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$rp_real,64
	bl	__ecp_nistz256_add	# p256_mul_by_2(res_z, tmp0);

	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(tmp0, S);

	ld	$bi,$Zsqr($sp)		# forward load for p256_mul_mont
	ld	$a0,$M+0($sp)
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_div_by_2	# p256_div_by_2(res_y, tmp0);

	addi	$bp,$sp,$Zsqr
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(M, M, Zsqr);

	mr	$t0,$acc0		# duplicate M
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	mr	$a0,$acc0		# put M aside
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add
	mr	$t0,$a0			# restore M
	mr	$t1,$a1
	mr	$t2,$a2
	mr	$t3,$a3
	ld	$bi,0($ap_real)		# forward load for p256_mul_mont
	ld	$a0,$S+0($sp)
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	bl	__ecp_nistz256_add	# p256_mul_by_3(M, M);

	addi	$bp,$ap_real,0
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, in_x);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$M+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_add	# p256_mul_by_2(tmp0, S);

	addi	$rp,$rp_real,0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(res_x, M);

	addi	$bp,$sp,$tmp0
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, tmp0);

	addi	$bp,$sp,$S
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sub_morf	# p256_sub(S, S, res_x);

	ld	$bi,$M($sp)
	mr	$a0,$acc0		# copy S
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$bp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, M);

	addi	$bp,$rp_real,32
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, S, res_y);

	mtlr	r0
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,2,0
	.long	0
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void	ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#				       const P256_POINT *in2);
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add
.align	5
ecp_nistz256_point_add:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	ld	$a0,64($bp)		# in2_z
	ld	$a1,72($bp)
	ld	$a2,80($bp)
	ld	$a3,88($bp)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in2infty,$t0,$t2
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty
	addi	$rp,$sp,$Z2sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);

	ld	$a0,64($ap_real)	# in1_z
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty
	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	ld	$bi,64($bp_real)
	ld	$a0,$Z2sqr+0($sp)
	ld	$a1,$Z2sqr+8($sp)
	ld	$a2,$Z2sqr+16($sp)
	ld	$a3,$Z2sqr+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);

	ld	$bi,64($ap_real)
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,32($ap_real)
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$sp,$S1
	ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
	ld	$a0,0($ap_real)
	ld	$a1,8($ap_real)
	ld	$a2,16($ap_real)
	ld	$a3,24($ap_real)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or	$temp,$acc0,$acc2

	addi	$bp,$sp,$Z2sqr
	addi	$rp,$sp,$U1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);

	ld	$bi,$Z1sqr($sp)
	ld	$a0,0($bp_real)
	ld	$a1,8($bp_real)
	ld	$a2,16($bp_real)
	ld	$a3,24($bp_real)
	addi	$bp,$sp,$Z1sqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);

	addi	$bp,$sp,$U1
	ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or.	$acc0,$acc0,$acc2
	bne	.Ladd_proceed		# is_equal(U1,U2)?

	and.	$t0,$in1infty,$in2infty
	beq	.Ladd_proceed		# (in1infty || in2infty)?

	cmpldi	$temp,0
	beq	.Ladd_double		# is_equal(S1,S2)?
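	# Added note: falling through here means H = U2-U1 is zero while
	# R = S2-S1 is not, i.e. the points share an x coordinate but
	# differ in y, so in1 == -in2 and the sum is the point at
	# infinity, emitted below as an all-zero point.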

	xor	$a0,$a0,$a0
	std	$a0,0($rp_real)
	std	$a0,8($rp_real)
	std	$a0,16($rp_real)
	std	$a0,24($rp_real)
	std	$a0,32($rp_real)
	std	$a0,40($rp_real)
	std	$a0,48($rp_real)
	std	$a0,56($rp_real)
	std	$a0,64($rp_real)
	std	$a0,72($rp_real)
	std	$a0,80($rp_real)
	std	$a0,88($rp_real)
	b	.Ladd_done

.align	4
.Ladd_double:
	ld	$bp,0($sp)		# back-link
	mr	$ap,$ap_real
	mr	$rp,$rp_real
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$bi,64($bp_real)
	ld	$a0,$res_z+0($sp)
	ld	$a1,$res_z+8($sp)
	ld	$a2,$res_z+16($sp)
	ld	$a3,$res_z+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,$Hsqr($sp)
	ld	$a0,$U1+0($sp)
	ld	$a1,$U1+8($sp)
	ld	$a2,$U1+16($sp)
	ld	$a3,$U1+24($sp)
	addi	$bp,$sp,$Hsqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$sp,$Hcub
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
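# Added note on the copy-out below: $in1infty/$in2infty hold all-ones
# masks when the corresponding input is *finite* (hence the "!in1infty"
# comments above), so each limb of the output is selected branch-free
# with and/andc/or: the computed sum by default, in2 if in1 was the
# point at infinity, in1 if in2 was.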
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	ld	$t0,$i+32($bp_real)	# in2
	ld	$t1,$i+40($bp_real)
	ld	$t2,$i+48($bp_real)
	ld	$t3,$i+56($bp_real)
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

.Ladd_done:
	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void	ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#					      const P256_POINT_AFFINE *in2);
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
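# Added note: $Z1sqr aliases $S2's slot; Z1sqr is consumed before S2 is
# produced, which is part of why this affine variant gets by with a
# smaller frame than the general point addition above.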
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Hcub+0($sp)
	ld	$a1,$Hcub+8($sp)
	ld	$a2,$Hcub+16($sp)
	ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

$code.=<<___;
########################################################################
# void	ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#				  uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.align	5
ecp_nistz256_ord_mul_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$acc4,$a3,$bi

	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	addze	$acc4,$acc4
	li	$acc5,0
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#          - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	#          + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
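	#
	# Added note: $ordk = 0xccd1c8aaee00bc4f is the usual Montgomery
	# constant -1/ord mod 2^64, so t4 = acc0*ordk makes
	# acc + t4*ord divisible by 2^64.  The "discarded" addic below
	# exists only to generate the carry out of the cancelled low
	# limb (carry is set exactly when acc0 is non-zero).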
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	mulld	$t0,$a0,$bi
	addze	$t3,$t3
	mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void	ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#				  int rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below mark carrying into high part of
	#  multiplication result, which can't overflow, because it
	#  can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___	if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
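# Added note: unlike the field squaring, the loop above is driven by the
# third argument ("mtctr $bp" earlier), so a single call computes
# a^(2^rep) in the ord Montgomery domain, as repeated-squaring callers
# such as scalar inversion want.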
} }

########################################################################
# scatter-gather subroutines
{
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#				int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#			       int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#				int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
	stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#			       int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush