1# Copyright (c) 2018, Amazon Inc. 2# 3# Permission to use, copy, modify, and/or distribute this software for any 4# purpose with or without fee is hereby granted, provided that the above 5# copyright notice and this permission notice appear in all copies. 6# 7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 10# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 12# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 13# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 14# 15# Written by Nir Drucker, and Shay Gueron 16# AWS Cryptographic Algorithms Group 17# (ndrucker@amazon.com, gueron@amazon.com) 18# based on BN_mod_inverse_odd 19 20$flavour = shift; 21$output = shift; 22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 23 24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 25 26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 28( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 29die "can't locate x86_64-xlate.pl"; 30 31open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 32*STDOUT=*OUT; 33 34############################################################################# 35# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS], 36# BN_ULONG a[P256_LIMBS], 37# BN_ULONG n[P256_LIMBS]); 38# 39# (Binary Extended Euclidean Algorithm. 40# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm) 41# 42# Assumption 1: n is odd for the BEEU 43# Assumption 2: 1 < a < n < 2^256 44 45$out = "%rdi"; 46$a = "%rsi"; 47$n = "%rdx"; 48 49# X/Y will hold the inverse parameter 50# Assumption: X,Y<2^(256) 51$x0 = "%r8"; 52$x1 = "%r9"; 53$x2 = "%r10"; 54$x3 = "%r11"; 55# borrow from out (out is needed only at the end) 56$x4 = "%rdi"; 57$y0 = "%r12"; 58$y1 = "%r13"; 59$y2 = "%r14"; 60$y3 = "%r15"; 61$y4 = "%rbp"; 62$shift = "%rcx"; 63$t0 = "%rax"; 64$t1 = "%rbx"; 65$t2 = "%rsi"; 66# borrow 67$t3 = "%rcx"; 68 69$T0 = "%xmm0"; 70$T1 = "%xmm1"; 71 72# Offsets on the stack 73$out_rsp = 0; 74$shift_rsp = $out_rsp+0x8; 75$a_rsp0 = $shift_rsp+0x8; 76$a_rsp1 = $a_rsp0+0x8; 77$a_rsp2 = $a_rsp1+0x8; 78$a_rsp3 = $a_rsp2+0x8; 79$b_rsp0 = $a_rsp3+0x8; 80$b_rsp1 = $b_rsp0+0x8; 81$b_rsp2 = $b_rsp1+0x8; 82$b_rsp3 = $b_rsp2+0x8; 83 84# Borrow when a_rsp/b_rsp are no longer needed. 85$y_rsp0 = $a_rsp0; 86$y_rsp1 = $y_rsp0+0x8; 87$y_rsp2 = $y_rsp1+0x8; 88$y_rsp3 = $y_rsp2+0x8; 89$y_rsp4 = $y_rsp3+0x8; 90$last_rsp_offset = $b_rsp3+0x8; 91 92sub TEST_B_ZERO { 93 return <<___; 94 xorq $t1, $t1 95 or $b_rsp0(%rsp), $t1 96 or $b_rsp1(%rsp), $t1 97 or $b_rsp2(%rsp), $t1 98 or $b_rsp3(%rsp), $t1 99 jz .Lbeeu_loop_end 100___ 101} 102 103$g_next_label = 0; 104 105sub SHIFT1 { 106 my ($var0, $var1, $var2, $var3, $var4) = @_; 107 my $label = ".Lshift1_${g_next_label}"; 108 $g_next_label++; 109 110 return <<___; 111 # Ensure X is even and divide by two. 112 movq \$1, $t1 113 andq $var0, $t1 114 jz $label 115 add 0*8($n), $var0 116 adc 1*8($n), $var1 117 adc 2*8($n), $var2 118 adc 3*8($n), $var3 119 adc \$0, $var4 120 121$label: 122 shrdq \$1, $var1, $var0 123 shrdq \$1, $var2, $var1 124 shrdq \$1, $var3, $var2 125 shrdq \$1, $var4, $var3 126 shrq \$1, $var4 127___ 128} 129 130sub SHIFT256 { 131 my ($var) = @_; 132 return <<___; 133 # Copy shifted values. 134 # Remember not to override t3=rcx 135 movq 1*8+$var(%rsp), $t0 136 movq 2*8+$var(%rsp), $t1 137 movq 3*8+$var(%rsp), $t2 138 139 shrdq %cl, $t0, 0*8+$var(%rsp) 140 shrdq %cl, $t1, 1*8+$var(%rsp) 141 shrdq %cl, $t2, 2*8+$var(%rsp) 142 143 shrq %cl, $t2 144 mov $t2, 3*8+$var(%rsp) 145___ 146} 147 148$code.=<<___; 149.text 150 151.type beeu_mod_inverse_vartime,\@function 152.hidden beeu_mod_inverse_vartime 153.globl beeu_mod_inverse_vartime 154.align 32 155beeu_mod_inverse_vartime: 156.cfi_startproc 157 _CET_ENDBR 158 push %rbp 159.cfi_push rbp 160 push %r12 161.cfi_push r12 162 push %r13 163.cfi_push r13 164 push %r14 165.cfi_push r14 166 push %r15 167.cfi_push r15 168 push %rbx 169.cfi_push rbx 170 push %rsi 171.cfi_push rsi 172 173 sub \$$last_rsp_offset, %rsp 174.cfi_adjust_cfa_offset $last_rsp_offset 175 movq $out, $out_rsp(%rsp) 176 177 # X=1, Y=0 178 movq \$1, $x0 179 xorq $x1, $x1 180 xorq $x2, $x2 181 xorq $x3, $x3 182 xorq $x4, $x4 183 184 xorq $y0, $y0 185 xorq $y1, $y1 186 xorq $y2, $y2 187 xorq $y3, $y3 188 xorq $y4, $y4 189 190 # Copy a/n into B/A on the stack. 191 vmovdqu 0*8($a), $T0 192 vmovdqu 2*8($a), $T1 193 vmovdqu $T0, $b_rsp0(%rsp) 194 vmovdqu $T1, $b_rsp2(%rsp) 195 196 vmovdqu 0*8($n), $T0 197 vmovdqu 2*8($n), $T1 198 vmovdqu $T0, $a_rsp0(%rsp) 199 vmovdqu $T1, $a_rsp2(%rsp) 200 201.Lbeeu_loop: 202 ${\TEST_B_ZERO} 203 204 # 0 < B < |n|, 205 # 0 < A <= |n|, 206 # (1) X*a == B (mod |n|), 207 # (2) (-1)*Y*a == A (mod |n|) 208 209 # Now divide B by the maximum possible power of two in the 210 # integers, and divide X by the same value mod |n|. When we're 211 # done, (1) still holds. 212 movq \$1, $shift 213 214 # Note that B > 0 215.Lbeeu_shift_loop_XB: 216 movq $shift, $t1 217 andq $b_rsp0(%rsp), $t1 218 jnz .Lbeeu_shift_loop_end_XB 219 220 ${\SHIFT1($x0, $x1, $x2, $x3, $x4)} 221 shl \$1, $shift 222 223 # Test wraparound of the shift parameter. The probability to have 32 zeroes 224 # in a row is small Therefore having the value below equal \$0x8000000 or 225 # \$0x8000 does not affect the performance. We choose 0x8000000 because it 226 # is the maximal immediate value possible. 227 cmp \$0x8000000, $shift 228 jne .Lbeeu_shift_loop_XB 229 230.Lbeeu_shift_loop_end_XB: 231 bsf $shift, $shift 232 test $shift, $shift 233 jz .Lbeeu_no_shift_XB 234 235 ${\SHIFT256($b_rsp0)} 236 237.Lbeeu_no_shift_XB: 238 # Same for A and Y. Afterwards, (2) still holds. 239 movq \$1, $shift 240 241 # Note that A > 0 242.Lbeeu_shift_loop_YA: 243 movq $shift, $t1 244 andq $a_rsp0(%rsp), $t1 245 jnz .Lbeeu_shift_loop_end_YA 246 247 ${\SHIFT1($y0, $y1, $y2, $y3, $y4)} 248 shl \$1, $shift 249 250 # Test wraparound of the shift parameter. The probability to have 32 zeroes 251 # in a row is small therefore having the value below equal \$0x8000000 or 252 # \$0x8000 Does not affect the performance. We choose 0x8000000 because it 253 # is the maximal immediate value possible. 254 cmp \$0x8000000, $shift 255 jne .Lbeeu_shift_loop_YA 256 257.Lbeeu_shift_loop_end_YA: 258 bsf $shift, $shift 259 test $shift, $shift 260 jz .Lbeeu_no_shift_YA 261 262 ${\SHIFT256($a_rsp0)} 263 264.Lbeeu_no_shift_YA: 265 # T = B-A (A,B < 2^256) 266 mov $b_rsp0(%rsp), $t0 267 mov $b_rsp1(%rsp), $t1 268 mov $b_rsp2(%rsp), $t2 269 mov $b_rsp3(%rsp), $t3 270 sub $a_rsp0(%rsp), $t0 271 sbb $a_rsp1(%rsp), $t1 272 sbb $a_rsp2(%rsp), $t2 273 sbb $a_rsp3(%rsp), $t3 # borrow from shift 274 jnc .Lbeeu_B_bigger_than_A 275 276 # A = A - B 277 mov $a_rsp0(%rsp), $t0 278 mov $a_rsp1(%rsp), $t1 279 mov $a_rsp2(%rsp), $t2 280 mov $a_rsp3(%rsp), $t3 281 sub $b_rsp0(%rsp), $t0 282 sbb $b_rsp1(%rsp), $t1 283 sbb $b_rsp2(%rsp), $t2 284 sbb $b_rsp3(%rsp), $t3 285 mov $t0, $a_rsp0(%rsp) 286 mov $t1, $a_rsp1(%rsp) 287 mov $t2, $a_rsp2(%rsp) 288 mov $t3, $a_rsp3(%rsp) 289 290 # Y = Y + X 291 add $x0, $y0 292 adc $x1, $y1 293 adc $x2, $y2 294 adc $x3, $y3 295 adc $x4, $y4 296 jmp .Lbeeu_loop 297 298.Lbeeu_B_bigger_than_A: 299 # B = T = B - A 300 mov $t0, $b_rsp0(%rsp) 301 mov $t1, $b_rsp1(%rsp) 302 mov $t2, $b_rsp2(%rsp) 303 mov $t3, $b_rsp3(%rsp) 304 305 # X = Y + X 306 add $y0, $x0 307 adc $y1, $x1 308 adc $y2, $x2 309 adc $y3, $x3 310 adc $y4, $x4 311 312 jmp .Lbeeu_loop 313 314.Lbeeu_loop_end: 315 # The Euclid's algorithm loop ends when A == beeu(a,n); 316 # Therefore (-1)*Y*a == A (mod |n|), Y>0 317 318 # Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) 319 mov $a_rsp0(%rsp), $t1 320 sub \$1, $t1 321 or $a_rsp1(%rsp), $t1 322 or $a_rsp2(%rsp), $t1 323 or $a_rsp3(%rsp), $t1 324 # If not, fail. 325 jnz .Lbeeu_err 326 327 # From this point on, we no longer need X 328 # Therefore we use it as a temporary storage. 329 # X = n 330 movq 0*8($n), $x0 331 movq 1*8($n), $x1 332 movq 2*8($n), $x2 333 movq 3*8($n), $x3 334 xorq $x4, $x4 335 336.Lbeeu_reduction_loop: 337 movq $y0, $y_rsp0(%rsp) 338 movq $y1, $y_rsp1(%rsp) 339 movq $y2, $y_rsp2(%rsp) 340 movq $y3, $y_rsp3(%rsp) 341 movq $y4, $y_rsp4(%rsp) 342 343 # If Y>n ==> Y=Y-n 344 sub $x0, $y0 345 sbb $x1, $y1 346 sbb $x2, $y2 347 sbb $x3, $y3 348 sbb \$0, $y4 349 350 # Choose old Y or new Y 351 cmovc $y_rsp0(%rsp), $y0 352 cmovc $y_rsp1(%rsp), $y1 353 cmovc $y_rsp2(%rsp), $y2 354 cmovc $y_rsp3(%rsp), $y3 355 jnc .Lbeeu_reduction_loop 356 357 # X = n - Y (n, Y < 2^256), (Cancel the (-1)) 358 sub $y0, $x0 359 sbb $y1, $x1 360 sbb $y2, $x2 361 sbb $y3, $x3 362 363.Lbeeu_save: 364 # Save the inverse(<2^256) to out. 365 mov $out_rsp(%rsp), $out 366 367 movq $x0, 0*8($out) 368 movq $x1, 1*8($out) 369 movq $x2, 2*8($out) 370 movq $x3, 3*8($out) 371 372 # Return 1. 373 movq \$1, %rax 374 jmp .Lbeeu_finish 375 376.Lbeeu_err: 377 # Return 0. 378 xorq %rax, %rax 379 380.Lbeeu_finish: 381 add \$$last_rsp_offset, %rsp 382.cfi_adjust_cfa_offset -$last_rsp_offset 383 pop %rsi 384.cfi_pop rsi 385 pop %rbx 386.cfi_pop rbx 387 pop %r15 388.cfi_pop r15 389 pop %r14 390.cfi_pop r14 391 pop %r13 392.cfi_pop r13 393 pop %r12 394.cfi_pop r12 395 pop %rbp 396.cfi_pop rbp 397 ret 398.cfi_endproc 399 400.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime 401___ 402 403print $code; 404close STDOUT or die "error closing STDOUT: $!"; 405