1# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved. 2# 3# Permission to use, copy, modify, and/or distribute this software for any 4# purpose with or without fee is hereby granted, provided that the above 5# copyright notice and this permission notice appear in all copies. 6# 7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 10# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 12# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 13# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 14# 15# 16# This code is based on p256_beeu-x86_64-asm.pl (which is based on BN_mod_inverse_odd). 17# 18 19# The first two arguments should always be the flavour and output file path. 20if ($#ARGV < 1) { die "Not enough arguments provided. 21 Two arguments are necessary: the flavour and the output file path."; } 22 23$flavour = shift; 24$output = shift; 25 26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 27( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 28( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or 29die "can't locate arm-xlate.pl"; 30 31open OUT,"| \"$^X\" $xlate $flavour $output"; 32*STDOUT=*OUT; 33############################################################################# 34# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS], 35# BN_ULONG a[P256_LIMBS], 36# BN_ULONG n[P256_LIMBS]); 37# 38# (Binary Extended GCD (Euclidean) Algorithm. 39# See A. Menezes, P. vanOorschot, and S. Vanstone's Handbook of Applied Cryptography, 40# Chapter 14, Algorithm 14.61 and Note 14.64 41# http://cacr.uwaterloo.ca/hac/about/chap14.pdf) 42 43# Assumption 1: n is odd for the BEEU 44# Assumption 2: 1 < a < n < 2^256 45 46# Details 47# The inverse of x modulo y can be calculated using Alg. 14.61, where "a" would be that inverse. 48# In other words, 49# ax == 1 (mod y) (where the symbol “==“ denotes ”congruent“) 50# a == x^{-1} (mod y) 51# 52# It can be shown that throughout all the iterations of the algorithm, the following holds: 53# u = Ax + By 54# v = Cx + Dy 55# The values B and D are not of interest in this case, so they need not be computed by the algorithm. 56# This means the following congruences hold through the iterations of the algorithm. 57# Ax == u (mod y) 58# Cx == v (mod y) 59 60# Now we will modify the notation to match that of BN_mod_inverse_odd() 61# on which beeu_mod_inverse_vartime() in `p256_beeu-x86_64-asm` is based. 62# In those functions: 63# x, y -> a, n 64# u, v -> B, A 65# A, C -> X, Y’, where Y’ = -Y 66# Hence, the following holds throughout the algorithm iterations 67# Xa == B (mod n) 68# -Ya == A (mod n) 69# 70# Same algorithm in Python: 71# def beeu(a, n): 72# X = 1 73# Y = 0 74# B = a 75# A = n 76# while (B != 0): 77# while (B % 2) == 0: 78# B >>= 1 79# if (X % 2) == 1: 80# X = X + n 81# X >>= 1 82# while (A % 2) == 0: 83# A >>= 1 84# if (Y % 2) == 1: 85# Y = Y + n 86# Y >>= 1 87# if (B >= A): 88# B = B - A 89# X = X + Y 90# else: 91# A = A - B 92# Y = Y + X 93# if (A != 1): 94# # error 95# return 0 96# else: 97# while (Y > n): 98# Y = Y - n 99# Y = n - Y 100# return Y 101 102 103# For the internal variables, 104# x0-x2, x30 are used to hold the modulus n. The input parameters passed in 105# x1,x2 are copied first before corrupting them. x0 (out) is stored on the stack. 106# x3-x7 are used for parameters, which is not the case in this function, so they are corruptible 107# x8 is corruptible here 108# (the function doesn't return a struct, hence x8 doesn't contain a passed-in address 109# for that struct). 110# x9-x15 are corruptible registers 111# x19-x28 are callee-saved registers 112 113# X/Y will hold the inverse parameter 114# Assumption: a,n,X,Y < 2^(256) 115# Initially, X := 1, Y := 0 116# A := n, B := a 117 118# Function parameters (as per the Procedure Call Standard) 119my($out, $a_in, $n_in)=map("x$_",(0..2)); 120# Internal variables 121my($n0, $n1, $n2, $n3)=map("x$_",(0..2,30)); 122my($x0, $x1, $x2, $x3, $x4)=map("x$_",(3..7)); 123my($y0, $y1, $y2, $y3, $y4)=map("x$_",(8..12)); 124my($shift)=("x13"); 125my($t0, $t1, $t2, $t3)=map("x$_",(14,15,19,20)); 126my($a0, $a1, $a2, $a3)=map("x$_",(21..24)); 127my($b0, $b1, $b2, $b3)=map("x$_",(25..28)); 128 129# if B == 0, jump to end of loop 130sub TEST_B_ZERO { 131 return <<___; 132 orr $t0, $b0, $b1 133 orr $t0, $t0, $b2 134 135 // reverse the bit order of $b0. This is needed for clz after this macro 136 rbit $t1, $b0 137 138 orr $t0, $t0, $b3 139 cbz $t0,.Lbeeu_loop_end 140___ 141} 142 143# Shift right by 1 bit, adding the modulus first if the variable is odd 144# if least_sig_bit(var0) == 0, 145# goto shift1_<ctr> 146# else 147# add n and goto shift1_<ctr> 148# Prerequisite: t0 = 0 149$g_next_label = 0; 150sub SHIFT1 { 151 my ($var0, $var1, $var2, $var3, $var4) = @_; 152 my $label = ".Lshift1_${g_next_label}"; 153 $g_next_label++; 154 return <<___; 155 tbz $var0, #0, $label 156 adds $var0, $var0, $n0 157 adcs $var1, $var1, $n1 158 adcs $var2, $var2, $n2 159 adcs $var3, $var3, $n3 160 adc $var4, $var4, $t0 161$label: 162 // var0 := [var1|var0]<64..1>; 163 // i.e. concatenate var1 and var0, 164 // extract bits <64..1> from the resulting 128-bit value 165 // and put them in var0 166 extr $var0, $var1, $var0, #1 167 extr $var1, $var2, $var1, #1 168 extr $var2, $var3, $var2, #1 169 extr $var3, $var4, $var3, #1 170 lsr $var4, $var4, #1 171___ 172} 173 174# compilation by clang 10.0.0 with -O2/-O3 of 175# a[0] = (a[0] >> count) | (a[1] << (64-count)); 176# a[1] = (a[1] >> count) | (a[2] << (64-count)); 177# a[2] = (a[2] >> count) | (a[3] << (64-count)); 178# a[3] >>= count; 179# Note: EXTR instruction used in SHIFT1 is similar to x86_64's SHRDQ 180# except that the second source operand of EXTR is only immediate; 181# that's why it cannot be used here where $shift is a variable 182# 183# In the following, 184# t0 := 0 - shift 185# 186# then var0, for example, will be shifted right as follows: 187# var0 := (var0 >> (uint(shift) mod 64)) | (var1 << (uint(t0) mod 64)) 188# "uint() mod 64" is from the definition of LSL and LSR instructions. 189# 190# What matters here is the order of instructions relative to certain other 191# instructions, i.e. 192# - lsr and lsl must precede orr of the corresponding registers. 193# - lsl must preced the lsr of the same register afterwards. 194# The chosen order of the instructions overall is to try and maximize 195# the pipeline usage. 196sub SHIFT256 { 197 my ($var0, $var1, $var2, $var3) = @_; 198 return <<___; 199 neg $t0, $shift 200 lsr $var0, $var0, $shift 201 lsl $t1, $var1, $t0 202 203 lsr $var1, $var1, $shift 204 lsl $t2, $var2, $t0 205 206 orr $var0, $var0, $t1 207 208 lsr $var2, $var2, $shift 209 lsl $t3, $var3, $t0 210 211 orr $var1, $var1, $t2 212 213 lsr $var3, $var3, $shift 214 215 orr $var2, $var2, $t3 216___ 217} 218 219$code.=<<___; 220#include "openssl/arm_arch.h" 221 222.text 223.globl beeu_mod_inverse_vartime 224.type beeu_mod_inverse_vartime, %function 225.align 4 226beeu_mod_inverse_vartime: 227 // Reserve enough space for 14 8-byte registers on the stack 228 // in the first stp call for x29, x30. 229 // Then store the remaining callee-saved registers. 230 // 231 // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | 232 // ^ ^ 233 // sp <------------------- 112 bytes ----------------> old sp 234 // x29 (FP) 235 // 236 AARCH64_SIGN_LINK_REGISTER 237 stp x29,x30,[sp,#-112]! 238 add x29,sp,#0 239 stp x19,x20,[sp,#16] 240 stp x21,x22,[sp,#32] 241 stp x23,x24,[sp,#48] 242 stp x25,x26,[sp,#64] 243 stp x27,x28,[sp,#80] 244 stp x0,x2,[sp,#96] 245 246 // B = b3..b0 := a 247 ldp $b0,$b1,[$a_in] 248 ldp $b2,$b3,[$a_in,#16] 249 250 // n3..n0 := n 251 // Note: the value of input params are changed in the following. 252 ldp $n0,$n1,[$n_in] 253 ldp $n2,$n3,[$n_in,#16] 254 255 // A = a3..a0 := n 256 mov $a0, $n0 257 mov $a1, $n1 258 mov $a2, $n2 259 mov $a3, $n3 260 261 // X = x4..x0 := 1 262 mov $x0, #1 263 eor $x1, $x1, $x1 264 eor $x2, $x2, $x2 265 eor $x3, $x3, $x3 266 eor $x4, $x4, $x4 267 268 // Y = y4..y0 := 0 269 eor $y0, $y0, $y0 270 eor $y1, $y1, $y1 271 eor $y2, $y2, $y2 272 eor $y3, $y3, $y3 273 eor $y4, $y4, $y4 274 275.Lbeeu_loop: 276 // if B == 0, jump to .Lbeeu_loop_end 277 ${\TEST_B_ZERO} 278 279 // 0 < B < |n|, 280 // 0 < A <= |n|, 281 // (1) X*a == B (mod |n|), 282 // (2) (-1)*Y*a == A (mod |n|) 283 284 // Now divide B by the maximum possible power of two in the 285 // integers, and divide X by the same value mod |n|. 286 // When we're done, (1) still holds. 287 288 // shift := number of trailing 0s in $b0 289 // ( = number of leading 0s in $t1; see the "rbit" instruction in TEST_B_ZERO) 290 clz $shift, $t1 291 292 // If there is no shift, goto shift_A_Y 293 cbz $shift, .Lbeeu_shift_A_Y 294 295 // Shift B right by "$shift" bits 296 ${\SHIFT256($b0, $b1, $b2, $b3)} 297 298 // Shift X right by "$shift" bits, adding n whenever X becomes odd. 299 // $shift--; 300 // $t0 := 0; needed in the addition to the most significant word in SHIFT1 301 eor $t0, $t0, $t0 302.Lbeeu_shift_loop_X: 303 ${\SHIFT1($x0, $x1, $x2, $x3, $x4)} 304 subs $shift, $shift, #1 305 bne .Lbeeu_shift_loop_X 306 307 // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl 308 // with the following differences: 309 // - "$shift" is set directly to the number of trailing 0s in B 310 // (using rbit and clz instructions) 311 // - The loop is only used to call SHIFT1(X) 312 // and $shift is decreased while executing the X loop. 313 // - SHIFT256(B, $shift) is performed before right-shifting X; they are independent 314 315.Lbeeu_shift_A_Y: 316 // Same for A and Y. 317 // Afterwards, (2) still holds. 318 // Reverse the bit order of $a0 319 // $shift := number of trailing 0s in $a0 (= number of leading 0s in $t1) 320 rbit $t1, $a0 321 clz $shift, $t1 322 323 // If there is no shift, goto |B-A|, X+Y update 324 cbz $shift, .Lbeeu_update_B_X_or_A_Y 325 326 // Shift A right by "$shift" bits 327 ${\SHIFT256($a0, $a1, $a2, $a3)} 328 329 // Shift Y right by "$shift" bits, adding n whenever Y becomes odd. 330 // $shift--; 331 // $t0 := 0; needed in the addition to the most significant word in SHIFT1 332 eor $t0, $t0, $t0 333.Lbeeu_shift_loop_Y: 334 ${\SHIFT1($y0, $y1, $y2, $y3, $y4)} 335 subs $shift, $shift, #1 336 bne .Lbeeu_shift_loop_Y 337 338.Lbeeu_update_B_X_or_A_Y: 339 // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) 340 // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words 341 // without taking a sign bit if generated. The lack of a carry would 342 // indicate a negative result. See, for example, 343 // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes 344 subs $t0, $b0, $a0 345 sbcs $t1, $b1, $a1 346 sbcs $t2, $b2, $a2 347 sbcs $t3, $b3, $a3 348 bcs .Lbeeu_B_greater_than_A 349 350 // Else A > B => 351 // A := A - B; Y := Y + X; goto beginning of the loop 352 subs $a0, $a0, $b0 353 sbcs $a1, $a1, $b1 354 sbcs $a2, $a2, $b2 355 sbcs $a3, $a3, $b3 356 357 adds $y0, $y0, $x0 358 adcs $y1, $y1, $x1 359 adcs $y2, $y2, $x2 360 adcs $y3, $y3, $x3 361 adc $y4, $y4, $x4 362 b .Lbeeu_loop 363 364.Lbeeu_B_greater_than_A: 365 // Continue with B > A => 366 // B := B - A; X := X + Y; goto beginning of the loop 367 mov $b0, $t0 368 mov $b1, $t1 369 mov $b2, $t2 370 mov $b3, $t3 371 372 adds $x0, $x0, $y0 373 adcs $x1, $x1, $y1 374 adcs $x2, $x2, $y2 375 adcs $x3, $x3, $y3 376 adc $x4, $x4, $y4 377 b .Lbeeu_loop 378 379.Lbeeu_loop_end: 380 // The Euclid's algorithm loop ends when A == gcd(a,n); 381 // this would be 1, when a and n are co-prime (i.e. do not have a common factor). 382 // Since (-1)*Y*a == A (mod |n|), Y>0 383 // then out = -Y mod n 384 385 // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) 386 // Is A-1 == 0? 387 // If not, fail. 388 sub $t0, $a0, #1 389 orr $t0, $t0, $a1 390 orr $t0, $t0, $a2 391 orr $t0, $t0, $a3 392 cbnz $t0, .Lbeeu_err 393 394 // If Y>n ==> Y:=Y-n 395.Lbeeu_reduction_loop: 396 // x_i := y_i - n_i (X is no longer needed, use it as temp) 397 // ($t0 = 0 from above) 398 subs $x0, $y0, $n0 399 sbcs $x1, $y1, $n1 400 sbcs $x2, $y2, $n2 401 sbcs $x3, $y3, $n3 402 sbcs $x4, $y4, $t0 403 404 // If result is non-negative (i.e., cs = carry set = no borrow), 405 // y_i := x_i; goto reduce again 406 // else 407 // y_i := y_i; continue 408 csel $y0, $x0, $y0, cs 409 csel $y1, $x1, $y1, cs 410 csel $y2, $x2, $y2, cs 411 csel $y3, $x3, $y3, cs 412 csel $y4, $x4, $y4, cs 413 bcs .Lbeeu_reduction_loop 414 415 // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) 416 // out = -Y = n-Y 417 subs $y0, $n0, $y0 418 sbcs $y1, $n1, $y1 419 sbcs $y2, $n2, $y2 420 sbcs $y3, $n3, $y3 421 422 // Save Y in output (out (x0) was saved on the stack) 423 ldr x3, [sp,#96] 424 stp $y0, $y1, [x3] 425 stp $y2, $y3, [x3,#16] 426 // return 1 (success) 427 mov x0, #1 428 b .Lbeeu_finish 429 430.Lbeeu_err: 431 // return 0 (error) 432 eor x0, x0, x0 433 434.Lbeeu_finish: 435 // Restore callee-saved registers, except x0, x2 436 add sp,x29,#0 437 ldp x19,x20,[sp,#16] 438 ldp x21,x22,[sp,#32] 439 ldp x23,x24,[sp,#48] 440 ldp x25,x26,[sp,#64] 441 ldp x27,x28,[sp,#80] 442 ldp x29,x30,[sp],#112 443 444 AARCH64_VALIDATE_LINK_REGISTER 445 ret 446.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime 447___ 448 449 450foreach (split("\n",$code)) { 451 s/\`([^\`]*)\`/eval $1/ge; 452 453 print $_,"\n"; 454} 455close STDOUT or die "error closing STDOUT: $!"; # enforce flush 456