#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
  Two arguments are necessary: the flavour and the output file path."; }

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "ring-core/arm_arch.h"

.section .rodata
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.text
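
// The constants above are stored least-significant quad first. A quick
// way to sanity-check the precomputed values (an illustrative Python
// sketch, not part of the build; run in any Python 3.8+ shell):
//
//   p = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff
//   n = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
//   assert pow(2, 512, p) == int("0x00000004fffffffdfffffffffffffffe"
//                                "fffffffbffffffff0000000000000003", 16)  # .LRR
//   assert pow(2, 256, p) == int("0x00000000fffffffeffffffffffffffff"
//                                "ffffffff000000000000000000000001", 16)  # .Lone_mont
//   assert -pow(n, -1, 2**64) % 2**64 == 0xccd1c8aaee00bc4f               # .LordK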

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	#          - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	#          + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	#          - 0000abcd.efgh0000.--------.--------.--------
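	#
	# The same trick written out as plain modular arithmetic (an
	# illustrative Python sketch, not part of the build): one
	# reduction step adds acc[0]*modulus, which zeroes the least
	# significant word so it can be shifted out, and the special
	# shape of the modulus lets that product be formed from shifts
	# and a subtraction.
	#
	#   p = 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff
	#   def reduce_step(acc):
	#       d = acc & (2**64 - 1)     # "magic" digit == least significant word
	#       acc += d * p              # d*p == (d<<256) - (d<<224) + (d<<192) + (d<<96) - d
	#       assert acc % 2**64 == 0   # low word is now guaranteed to be 0 ...
	#       return acc >> 64          # ... and is omitted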
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	// |  |  |  |  |  |a1*a0|  |
	// |  |  |  |  |a2*a0|  |  |
	// |  |a3*a2|a3*a0|  |  |  |
	// |  |  |  |a2*a1|  |  |  |
	// |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	// "can't overflow" below mark carrying into high part of
	// multiplication result, which can't overflow, because it
	// can never be all ones.
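	//
	// In other words (an illustrative Python sketch of the diagram
	// above, not part of the build): with B = 2^64, the off-diagonal
	// products are summed once, doubled, and the squares added in.
	//
	//   B = 2**64
	//   def sqr(a0, a1, a2, a3):
	//       cross = (a1*a0*B + a2*a0*B**2 + (a3*a0 + a2*a1)*B**3
	//                + a3*a1*B**4 + a3*a2*B**5)
	//       diag = a0*a0 + a1*a1*B**2 + a2*a2*B**4 + a3*a3*B**6
	//       return 2*cross + diag   # == (a0 + a1*B + a2*B**2 + a3*B**3)**2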

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {	# reductions, see commentary in
			# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
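
// The final conditional subtraction used by both subroutines above is
// the usual branch-free normalization keeping the result in [0, p);
// an illustrative Python sketch (not part of the build):
//
//   def cond_sub_p(x):            # 0 <= x < 2*p on entry
//       t = x - p
//       return x if t < 0 else t  # csel on the borrow flag, no branch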

// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add_to,%function
.align	4
__ecp_nistz256_add_to:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
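
# Halving modulo p in __ecp_nistz256_div_by_2 boils down to making the
# value even before shifting; the 65th bit rescued into $ap is the carry
# out of the conditional addition. An illustrative Python sketch (not
# part of the build):
#
#   def div_by_2(x):    # 0 <= x < p
#       if x & 1:       # done branch-free with csel in the code above
#           x += p      # same residue mod p, now even
#       return x >> 1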

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	mov	$ap_real,$ap
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	mov	$t0,$acc0
	ldr	$poly3,[$poly3,#24]
	mov	$t1,$acc1
	ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$M]
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add_to
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}
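
# Pulled together, the call sequence above computes the standard
# Jacobian doubling formulas; an illustrative Python sketch (not part
# of the build, all arithmetic mod p, Montgomery factors omitted):
#
#   def point_double(x, y, z):
#       s  = (4 * x * y * y) % p               # S = 4*x*y^2
#       m  = (3 * (x + z*z) * (x - z*z)) % p   # M = 3*(x+Zsqr)*(x-Zsqr)
#       x3 = (m*m - 2*s) % p                   # res_x = M^2 - 2*S
#       y3 = (m * (s - x3) - 8 * y**4) % p     # res_y = M*(S-res_x) - 8*y^4
#       z3 = (2 * y * z) % p                   # res_z = 2*y*z
#       return x3, y3, z3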

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[$ap_real]
	ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)

	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
	orr	$acc0,$acc0,$temp1
	orr	$acc0,$acc0,$temp2
	orr	$acc0,$acc0,$temp0
	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}
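
# In ecp_nistz256.c terms the sequence above is the textbook Jacobian
# addition with a fallback to doubling; an illustrative Python sketch
# (not part of the build, all arithmetic mod p, Montgomery factors and
# the constant-time infinity masks omitted):
#
#   def point_add(P1, P2):
#       x1, y1, z1 = P1; x2, y2, z2 = P2
#       u1, s1 = x1*z2*z2 % p, y1*z2**3 % p      # U1, S1
#       u2, s2 = x2*z1*z1 % p, y2*z1**3 % p      # U2, S2
#       h, r = (u2 - u1) % p, (s2 - s1) % p      # H, R
#       if h == 0 and r == 0:                    # same affine point: double
#           return point_double(x1, y1, z1)
#       x3 = (r*r - h**3 - 2*u1*h*h) % p
#       y3 = (r*(u1*h*h - x3) - s1*h**3) % p
#       z3 = h*z1*z2 % p
#       return x3, y3, z3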

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adrp	$bp_real,:pg_hi21:.Lone_mont-64
	add	$bp_real,$bp_real,:lo12:.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0		// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0		// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
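
# This is the same addition specialized to an affine second point
# (Z2 == 1, so U1 = in1_x and S1 = in1_y), with the conditional moves
# substituting one in Montgomery form (.Lone_mont) as in2's Z
# coordinate; an illustrative Python sketch (not part of the build,
# all arithmetic mod p):
#
#   def point_add_affine(P1, Q):
#       x1, y1, z1 = P1; x2, y2 = Q             # Q is affine
#       u2 = x2*z1*z1 % p                       # U2 = in2_x*Z1^2
#       s2 = y2*z1**3 % p                       # S2 = in2_y*Z1^3
#       h, r = (u2 - x1) % p, (s2 - y1) % p     # H, R
#       x3 = (r*r - h**3 - 2*x1*h*h) % p
#       y3 = (r*(x1*h*h - x3) - y1*h**3) % p
#       z3 = h*z1 % p
#       return x3, y3, z3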

if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,:pg_hi21:.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	#          * abcdefgh
	#          + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#            xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#          - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	#          + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
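	#
	# Unlike the .Lpoly case, the "magic" digit here is not the
	# least significant word itself but acc[0]*0xccd1c8aaee00bc4f
	# (.LordK, the negated inverse of the group order modulo 2^64);
	# an illustrative Python sketch of one reduction step (not part
	# of the build):
	#
	#   n = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
	#   def ord_reduce_step(acc):
	#       d = (acc & (2**64 - 1)) * 0xccd1c8aaee00bc4f % 2**64
	#       acc += d * n       # low word becomes zero: n*.LordK == -1 mod 2^64
	#       return acc >> 64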
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,:pg_hi21:.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	// |  |  |  |  |  |a1*a0|  |
	// |  |  |  |  |a2*a0|  |  |
	// |  |a3*a2|a3*a0|  |  |  |
	// |  |  |  |a2*a1|  |  |  |
	// |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	// "can't overflow" below mark carrying into high part of
	// multiplication result, which can't overflow, because it
	// can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
} }

########################################################################
# select subroutines
# These select functions are similar to those in p256-x86_64-asm.pl
# They load all points in the lookup table
# keeping in the output only the one corresponding to the input index.
{
my ($val,$in_t)=map("x$_",(0..1));
my ($index)=("w2");
my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11");
my ($Mask)=("v3");
my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21));
my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27));
$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,%function
.align	4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

	// $Val_in := $val
	// $Idx_ctr := 0; loop counter and incremented internal index
	mov	$Val_in, $val
	mov	$Idx_ctr, #0

	// [$Ra-$Rf] := 0
	movi	$Ra.16b, #0
	movi	$Rb.16b, #0
	movi	$Rc.16b, #0
	movi	$Rd.16b, #0
	movi	$Re.16b, #0
	movi	$Rf.16b, #0

.Lselect_w5_loop:
	// Loop 16 times.

	// Increment index (loop counter); tested at the end of the loop
	add	$Idx_ctr, $Idx_ctr, #1

	// [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t
	// and advance $in_t to point to the next entry
	ld1	{$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64

	// $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
	cmp	$Idx_ctr, $index
	csetm	$Mask_64, eq

	// continue loading ...
	ld1	{$T0e.2d, $T0f.2d}, [$in_t],#32

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup	$Mask.2d, $Mask_64

	// [$Ra-$Rf] := (Mask == all 1s)? [$T0a-$T0f] : [$Ra-$Rf]
	// i.e., values in output registers will remain the same if $Idx_ctr != $index
	bit	$Ra.16b, $T0a.16b, $Mask.16b
	bit	$Rb.16b, $T0b.16b, $Mask.16b

	bit	$Rc.16b, $T0c.16b, $Mask.16b
	bit	$Rd.16b, $T0d.16b, $Mask.16b

	bit	$Re.16b, $T0e.16b, $Mask.16b
	bit	$Rf.16b, $T0f.16b, $Mask.16b

	// If bit #4 is 0 (i.e. idx_ctr < 16) loop back
	tbz	$Idx_ctr, #4, .Lselect_w5_loop

	// Write [$Ra-$Rf] to memory at the output pointer
	st1	{$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64
	st1	{$Re.2d, $Rf.2d}, [$Val_in]

	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,%function
.align	4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

	// $Idx_ctr := 0; loop counter and incremented internal index
	mov	$Idx_ctr, #0

	// [$Ra-$Rd] := 0
	movi	$Ra.16b, #0
	movi	$Rb.16b, #0
	movi	$Rc.16b, #0
	movi	$Rd.16b, #0

.Lselect_w7_loop:
	// Loop 64 times.

	// Increment index (loop counter); tested at the end of the loop
	add	$Idx_ctr, $Idx_ctr, #1

	// [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t
	// and advance $in_t to point to the next entry
	ld1	{$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64

	// $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
	cmp	$Idx_ctr, $index
	csetm	$Mask_64, eq

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup	$Mask.2d, $Mask_64

	// [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
	// i.e., values in output registers will remain the same if $Idx_ctr != $index
	bit	$Ra.16b, $T0a.16b, $Mask.16b
	bit	$Rb.16b, $T0b.16b, $Mask.16b

	bit	$Rc.16b, $T0c.16b, $Mask.16b
	bit	$Rd.16b, $T0d.16b, $Mask.16b

	// If bit #6 is 0 (i.e. idx_ctr < 64) loop back
	tbz	$Idx_ctr, #6, .Lselect_w7_loop

	// Write [$Ra-$Rd] to memory at the output pointer
	st1	{$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val]

	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush