1# Copyright (c) 2018, Amazon Inc. 2# 3# Permission to use, copy, modify, and/or distribute this software for any 4# purpose with or without fee is hereby granted, provided that the above 5# copyright notice and this permission notice appear in all copies. 6# 7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 10# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 12# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 13# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 14# 15# Written by Nir Drucker, and Shay Gueron 16# AWS Cryptographic Algorithms Group 17# (ndrucker@amazon.com, gueron@amazon.com) 18# based on BN_mod_inverse_odd 19 20$flavour = shift; 21$output = shift; 22if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 23 24$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 25 26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 27( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 28( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 29die "can't locate x86_64-xlate.pl"; 30 31open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 32*STDOUT=*OUT; 33 34############################################################################# 35# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS], 36# BN_ULONG a[P256_LIMBS], 37# BN_ULONG n[P256_LIMBS]); 38# 39# (Binary Extended Euclidean Algorithm. 40# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm) 41# 42# Assumption 1: n is odd for the BEEU 43# Assumption 2: 1 < a < n < 2^256 44 45$out = "%rdi"; 46$a = "%rsi"; 47$n = "%rdx"; 48 49# X/Y will hold the inverse parameter 50# Assumption: X,Y<2^(256) 51$x0 = "%r8"; 52$x1 = "%r9"; 53$x2 = "%r10"; 54$x3 = "%r11"; 55# borrow from out (out is needed only at the end) 56$x4 = "%rdi"; 57$y0 = "%r12"; 58$y1 = "%r13"; 59$y2 = "%r14"; 60$y3 = "%r15"; 61$y4 = "%rbp"; 62$shift = "%rcx"; 63$t0 = "%rax"; 64$t1 = "%rbx"; 65$t2 = "%rsi"; 66# borrow 67$t3 = "%rcx"; 68 69$T0 = "%xmm0"; 70$T1 = "%xmm1"; 71 72# Offsets on the stack 73$out_rsp = 0; 74$shift_rsp = $out_rsp+0x8; 75$a_rsp0 = $shift_rsp+0x8; 76$a_rsp1 = $a_rsp0+0x8; 77$a_rsp2 = $a_rsp1+0x8; 78$a_rsp3 = $a_rsp2+0x8; 79$b_rsp0 = $a_rsp3+0x8; 80$b_rsp1 = $b_rsp0+0x8; 81$b_rsp2 = $b_rsp1+0x8; 82$b_rsp3 = $b_rsp2+0x8; 83 84# Borrow when a_rsp/b_rsp are no longer needed. 85$y_rsp0 = $a_rsp0; 86$y_rsp1 = $y_rsp0+0x8; 87$y_rsp2 = $y_rsp1+0x8; 88$y_rsp3 = $y_rsp2+0x8; 89$y_rsp4 = $y_rsp3+0x8; 90$last_rsp_offset = $b_rsp3+0x8; 91 92sub TEST_B_ZERO { 93 return <<___; 94 xorq $t1, $t1 95 or $b_rsp0(%rsp), $t1 96 or $b_rsp1(%rsp), $t1 97 or $b_rsp2(%rsp), $t1 98 or $b_rsp3(%rsp), $t1 99 jz .Lbeeu_loop_end 100___ 101} 102 103$g_next_label = 0; 104 105sub SHIFT1 { 106 my ($var0, $var1, $var2, $var3, $var4) = @_; 107 my $label = ".Lshift1_${g_next_label}"; 108 $g_next_label++; 109 110 return <<___; 111 # Ensure X is even and divide by two. 112 movq \$1, $t1 113 andq $var0, $t1 114 jz $label 115 add 0*8($n), $var0 116 adc 1*8($n), $var1 117 adc 2*8($n), $var2 118 adc 3*8($n), $var3 119 adc \$0, $var4 120 121$label: 122 shrdq \$1, $var1, $var0 123 shrdq \$1, $var2, $var1 124 shrdq \$1, $var3, $var2 125 shrdq \$1, $var4, $var3 126 shrq \$1, $var4 127___ 128} 129 130sub SHIFT256 { 131 my ($var) = @_; 132 return <<___; 133 # Copy shifted values. 134 # Remember not to override t3=rcx 135 movq 1*8+$var(%rsp), $t0 136 movq 2*8+$var(%rsp), $t1 137 movq 3*8+$var(%rsp), $t2 138 139 shrdq %cl, $t0, 0*8+$var(%rsp) 140 shrdq %cl, $t1, 1*8+$var(%rsp) 141 shrdq %cl, $t2, 2*8+$var(%rsp) 142 143 shrq %cl, $t2 144 mov $t2, 3*8+$var(%rsp) 145___ 146} 147 148$code.=<<___; 149.text 150 151.type beeu_mod_inverse_vartime,\@function 152.hidden beeu_mod_inverse_vartime 153.globl beeu_mod_inverse_vartime 154.align 32 155beeu_mod_inverse_vartime: 156.cfi_startproc 157 push %rbp 158.cfi_push rbp 159 push %r12 160.cfi_push r12 161 push %r13 162.cfi_push r13 163 push %r14 164.cfi_push r14 165 push %r15 166.cfi_push r15 167 push %rbx 168.cfi_push rbx 169 push %rsi 170.cfi_push rsi 171 172 sub \$$last_rsp_offset, %rsp 173.cfi_adjust_cfa_offset $last_rsp_offset 174 movq $out, $out_rsp(%rsp) 175 176 # X=1, Y=0 177 movq \$1, $x0 178 xorq $x1, $x1 179 xorq $x2, $x2 180 xorq $x3, $x3 181 xorq $x4, $x4 182 183 xorq $y0, $y0 184 xorq $y1, $y1 185 xorq $y2, $y2 186 xorq $y3, $y3 187 xorq $y4, $y4 188 189 # Copy a/n into B/A on the stack. 190 vmovdqu 0*8($a), $T0 191 vmovdqu 2*8($a), $T1 192 vmovdqu $T0, $b_rsp0(%rsp) 193 vmovdqu $T1, $b_rsp2(%rsp) 194 195 vmovdqu 0*8($n), $T0 196 vmovdqu 2*8($n), $T1 197 vmovdqu $T0, $a_rsp0(%rsp) 198 vmovdqu $T1, $a_rsp2(%rsp) 199 200.Lbeeu_loop: 201 ${\TEST_B_ZERO} 202 203 # 0 < B < |n|, 204 # 0 < A <= |n|, 205 # (1) X*a == B (mod |n|), 206 # (2) (-1)*Y*a == A (mod |n|) 207 208 # Now divide B by the maximum possible power of two in the 209 # integers, and divide X by the same value mod |n|. When we're 210 # done, (1) still holds. 211 movq \$1, $shift 212 213 # Note that B > 0 214.Lbeeu_shift_loop_XB: 215 movq $shift, $t1 216 andq $b_rsp0(%rsp), $t1 217 jnz .Lbeeu_shift_loop_end_XB 218 219 ${\SHIFT1($x0, $x1, $x2, $x3, $x4)} 220 shl \$1, $shift 221 222 # Test wraparound of the shift parameter. The probability to have 32 zeroes 223 # in a row is small Therefore having the value below equal \$0x8000000 or 224 # \$0x8000 does not affect the performance. We choose 0x8000000 because it 225 # is the maximal immediate value possible. 226 cmp \$0x8000000, $shift 227 jne .Lbeeu_shift_loop_XB 228 229.Lbeeu_shift_loop_end_XB: 230 bsf $shift, $shift 231 test $shift, $shift 232 jz .Lbeeu_no_shift_XB 233 234 ${\SHIFT256($b_rsp0)} 235 236.Lbeeu_no_shift_XB: 237 # Same for A and Y. Afterwards, (2) still holds. 238 movq \$1, $shift 239 240 # Note that A > 0 241.Lbeeu_shift_loop_YA: 242 movq $shift, $t1 243 andq $a_rsp0(%rsp), $t1 244 jnz .Lbeeu_shift_loop_end_YA 245 246 ${\SHIFT1($y0, $y1, $y2, $y3, $y4)} 247 shl \$1, $shift 248 249 # Test wraparound of the shift parameter. The probability to have 32 zeroes 250 # in a row is small therefore having the value below equal \$0x8000000 or 251 # \$0x8000 Does not affect the performance. We choose 0x8000000 because it 252 # is the maximal immediate value possible. 253 cmp \$0x8000000, $shift 254 jne .Lbeeu_shift_loop_YA 255 256.Lbeeu_shift_loop_end_YA: 257 bsf $shift, $shift 258 test $shift, $shift 259 jz .Lbeeu_no_shift_YA 260 261 ${\SHIFT256($a_rsp0)} 262 263.Lbeeu_no_shift_YA: 264 # T = B-A (A,B < 2^256) 265 mov $b_rsp0(%rsp), $t0 266 mov $b_rsp1(%rsp), $t1 267 mov $b_rsp2(%rsp), $t2 268 mov $b_rsp3(%rsp), $t3 269 sub $a_rsp0(%rsp), $t0 270 sbb $a_rsp1(%rsp), $t1 271 sbb $a_rsp2(%rsp), $t2 272 sbb $a_rsp3(%rsp), $t3 # borrow from shift 273 jnc .Lbeeu_B_bigger_than_A 274 275 # A = A - B 276 mov $a_rsp0(%rsp), $t0 277 mov $a_rsp1(%rsp), $t1 278 mov $a_rsp2(%rsp), $t2 279 mov $a_rsp3(%rsp), $t3 280 sub $b_rsp0(%rsp), $t0 281 sbb $b_rsp1(%rsp), $t1 282 sbb $b_rsp2(%rsp), $t2 283 sbb $b_rsp3(%rsp), $t3 284 mov $t0, $a_rsp0(%rsp) 285 mov $t1, $a_rsp1(%rsp) 286 mov $t2, $a_rsp2(%rsp) 287 mov $t3, $a_rsp3(%rsp) 288 289 # Y = Y + X 290 add $x0, $y0 291 adc $x1, $y1 292 adc $x2, $y2 293 adc $x3, $y3 294 adc $x4, $y4 295 jmp .Lbeeu_loop 296 297.Lbeeu_B_bigger_than_A: 298 # B = T = B - A 299 mov $t0, $b_rsp0(%rsp) 300 mov $t1, $b_rsp1(%rsp) 301 mov $t2, $b_rsp2(%rsp) 302 mov $t3, $b_rsp3(%rsp) 303 304 # X = Y + X 305 add $y0, $x0 306 adc $y1, $x1 307 adc $y2, $x2 308 adc $y3, $x3 309 adc $y4, $x4 310 311 jmp .Lbeeu_loop 312 313.Lbeeu_loop_end: 314 # The Euclid's algorithm loop ends when A == beeu(a,n); 315 # Therefore (-1)*Y*a == A (mod |n|), Y>0 316 317 # Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) 318 mov $a_rsp0(%rsp), $t1 319 sub \$1, $t1 320 or $a_rsp1(%rsp), $t1 321 or $a_rsp2(%rsp), $t1 322 or $a_rsp3(%rsp), $t1 323 # If not, fail. 324 jnz .Lbeeu_err 325 326 # From this point on, we no longer need X 327 # Therefore we use it as a temporary storage. 328 # X = n 329 movq 0*8($n), $x0 330 movq 1*8($n), $x1 331 movq 2*8($n), $x2 332 movq 3*8($n), $x3 333 xorq $x4, $x4 334 335.Lbeeu_reduction_loop: 336 movq $y0, $y_rsp0(%rsp) 337 movq $y1, $y_rsp1(%rsp) 338 movq $y2, $y_rsp2(%rsp) 339 movq $y3, $y_rsp3(%rsp) 340 movq $y4, $y_rsp4(%rsp) 341 342 # If Y>n ==> Y=Y-n 343 sub $x0, $y0 344 sbb $x1, $y1 345 sbb $x2, $y2 346 sbb $x3, $y3 347 sbb \$0, $y4 348 349 # Choose old Y or new Y 350 cmovc $y_rsp0(%rsp), $y0 351 cmovc $y_rsp1(%rsp), $y1 352 cmovc $y_rsp2(%rsp), $y2 353 cmovc $y_rsp3(%rsp), $y3 354 jnc .Lbeeu_reduction_loop 355 356 # X = n - Y (n, Y < 2^256), (Cancel the (-1)) 357 sub $y0, $x0 358 sbb $y1, $x1 359 sbb $y2, $x2 360 sbb $y3, $x3 361 362.Lbeeu_save: 363 # Save the inverse(<2^256) to out. 364 mov $out_rsp(%rsp), $out 365 366 movq $x0, 0*8($out) 367 movq $x1, 1*8($out) 368 movq $x2, 2*8($out) 369 movq $x3, 3*8($out) 370 371 # Return 1. 372 movq \$1, %rax 373 jmp .Lbeeu_finish 374 375.Lbeeu_err: 376 # Return 0. 377 xorq %rax, %rax 378 379.Lbeeu_finish: 380 add \$$last_rsp_offset, %rsp 381.cfi_adjust_cfa_offset -$last_rsp_offset 382 pop %rsi 383.cfi_pop rsi 384 pop %rbx 385.cfi_pop rbx 386 pop %r15 387.cfi_pop r15 388 pop %r14 389.cfi_pop r14 390 pop %r13 391.cfi_pop r13 392 pop %r12 393.cfi_pop r12 394 pop %rbp 395.cfi_pop rbp 396 ret 397.cfi_endproc 398 399.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime 400___ 401 402print $code; 403close STDOUT; 404