#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in
#    the documentation and/or other materials provided with the
#    distribution.
#
# 3. All advertising materials mentioning features or use of this
#    software must display the following acknowledgment:
#    "This product includes software developed by the OpenSSL Project
#    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
#
# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
#    endorse or promote products derived from this software without
#    prior written permission. For written permission, please contact
#    openssl-core@openssl.org.
#
# 5. Products derived from this software may not be called "OpenSSL"
#    nor may "OpenSSL" appear in their names without prior written
#    permission of the OpenSSL Project.
#
# 6. Redistributions of any form whatsoever must retain the following
#    acknowledgment:
#    "This product includes software developed by the OpenSSL Project
#    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
#
# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# ====================================================================
#
# This product includes cryptographic software written by Eric Young
# (eay@cryptsoft.com). This product includes software written by Tim
# Hudson (tjh@cryptsoft.com).


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+120-360%
# Cortex-A53		+120-400%
# Cortex-A57		+120-350%
# X-Gene		+200-330%
# Denver		+140-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.
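#
# All arithmetic below is in the field of the NIST P-256 prime
#
#	p = 2^256-2^224+2^192+2^96-1
#
# with field elements kept in the Montgomery domain, i.e. a value a is
# represented as a*2^256 mod p. Accordingly, .Lpoly below holds p and
# .Lone_mont holds 1*2^256 mod p, both as four 64-bit limbs, least
# significant limb first.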

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include <GFp/arm_arch.h>

.text
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

// void	GFp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	GFp_nistz256_mul_mont
.type	GFp_nistz256_mul_mont,%function
.align	4
GFp_nistz256_mul_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont

// void	GFp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	GFp_nistz256_sqr_mont
.type	GFp_nistz256_sqr_mont,%function
.align	4
GFp_nistz256_sqr_mont:
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	ret
.size	GFp_nistz256_sqr_mont,.-GFp_nistz256_sqr_mont

// void	GFp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
//					const BN_ULONG x2[4]);
.globl	GFp_nistz256_add
.type	GFp_nistz256_add,%function
.align	4
GFp_nistz256_add:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	$acc0,$acc1,[$ap]
	ldp	$t0,$t1,[$bp]
	ldp	$acc2,$acc3,[$ap,#16]
	ldp	$t2,$t3,[$bp,#16]
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_add

	ldp	x29,x30,[sp],#16
	ret
.size	GFp_nistz256_add,.-GFp_nistz256_add

// void	GFp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	GFp_nistz256_neg
.type	GFp_nistz256_neg,%function
.align	4
GFp_nistz256_neg:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	ret
.size	GFp_nistz256_neg,.-GFp_nistz256_neg

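// Inputs and outputs of the subroutines below stay in the Montgomery
// domain: __ecp_nistz256_mul_mont returns a*b*2^-256 mod p and
// __ecp_nistz256_sqr_mont returns a*a*2^-256 mod p.
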
// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------

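	# Why the "magic" digit equals the least significant word: the
	# digit is acc[0]*n0 mod 2^64, where n0 = -p^{-1} mod 2^64, and
	# because the least significant limb of p is 2^64-1, n0 is 1,
	# so the digit is simply acc[0] itself.
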
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  "can't overflow" annotations below mark carrying into the high
	//  part of a multiplication result, which can't overflow, because
	//  it can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	mul	$t0,$a2,$a1		// a[2]*a[1]
	umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a3,$a1		// a[3]*a[1]
	umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {	# reductions, see commentary in
			# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add expects both input vectors pre-loaded to
// $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$acc0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$acc0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

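// __ecp_nistz256_div_by_2 computes a/2 mod p: if a is odd, then a+p is
// even and (a+p)/2 is congruent to a/2 mod p, so the modulus is
// conditionally added before the right shift; bit 256 of the sum a+p
// is kept in $ap and shifted back in from the top.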
.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$acc0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those
# found in ecp_nistz256.c
#
########################################################################
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

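# The sequence of subroutine calls below computes the usual Jacobian
# doubling formulas (matching the reference ecp_nistz256.c):
#
#	S  = 4*X*Y^2
#	M  = 3*(X+Z^2)*(X-Z^2)
#	X' = M^2-2*S
#	Y' = M*(S-X')-8*Y^4
#	Z' = 2*Y*Z
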
$code.=<<___;
.globl	GFp_nistz256_point_double
.type	GFp_nistz256_point_double,%function
.align	5
GFp_nistz256_point_double:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	mov	$ap_real,$ap
	ldr	$poly1,.Lpoly+8
	mov	$t0,$acc0
	ldr	$poly3,.Lpoly+24
	mov	$t1,$acc1
	ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$M]
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#80
	ret
.size	GFp_nistz256_point_double,.-GFp_nistz256_point_double
___
}

########################################################################
# void GFp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

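# The sequence of subroutine calls below computes the usual mixed
# Jacobian/affine addition formulas (matching the reference
# ecp_nistz256.c), with in2 being the affine point:
#
#	U2 = X2*Z1^2
#	S2 = Y2*Z1^3
#	H  = U2-X1
#	R  = S2-Y1
#	X3 = R^2-H^3-2*X1*H^2
#	Y3 = R*(X1*H^2-X3)-Y1*H^3
#	Z3 = H*Z1
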
$code.=<<___;
.globl	GFp_nistz256_point_add_affine
.type	GFp_nistz256_point_add_affine,%function
.align	5
GFp_nistz256_point_add_affine:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	ldr	$poly1,.Lpoly+8
	ldr	$poly3,.Lpoly+24

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// !in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// !in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	ldp	$a0,$a1,[sp,#$Hcub]
	ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]	// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]	// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
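# The tail below selects the final result in constant time: $in1infty
# and $in2infty hold all-ones masks when the corresponding input is
# *not* the point at infinity, so the csel chains return in2 when in1
# is infinity and in1 when in2 is infinity. Since in2 is affine, its
# missing z coordinate is taken as 1 in Montgomery form, which is why
# $bp_real is redirected to .Lone_mont-64 before the z limbs are read.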
for($i=0;$i<64;$i+=32) {	# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adr	$bp_real,.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// !$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// !$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	GFp_nistz256_point_add_affine,.-GFp_nistz256_point_add_affine
___
}	}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT";