• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
2#
3# Permission to use, copy, modify, and/or distribute this software for any
4# purpose with or without fee is hereby granted, provided that the above
5# copyright notice and this permission notice appear in all copies.
6#
7# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14#
15#
16# This code is based on p256_beeu-x86_64-asm.pl (which is based on BN_mod_inverse_odd).
17#
18
19# The first two arguments should always be the flavour and output file path.
20if ($#ARGV < 1) { die "Not enough arguments provided.
21  Two arguments are necessary: the flavour and the output file path."; }
22
23$flavour = shift;
24$output  = shift;
25
26$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
27( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
28( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
29die "can't locate arm-xlate.pl";
30
31open OUT,"| \"$^X\" $xlate $flavour $output";
32*STDOUT=*OUT;
33#############################################################################
34# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
35#                                     BN_ULONG a[P256_LIMBS],
36#                                     BN_ULONG n[P256_LIMBS]);
37#
38# (Binary Extended GCD (Euclidean) Algorithm.
39#  See A. Menezes, P. vanOorschot, and S. Vanstone's Handbook of Applied Cryptography,
40#  Chapter 14, Algorithm 14.61 and Note 14.64
41#  http://cacr.uwaterloo.ca/hac/about/chap14.pdf)
42
43# Assumption 1: n is odd for the BEEU
44# Assumption 2: 1 < a < n < 2^256
45
46# Details
47# The inverse of x modulo y can be calculated using Alg. 14.61, where "a" would be that inverse.
48# In other words,
49# ax == 1 (mod y) (where the symbol “==“ denotes ”congruent“)
50#  a == x^{-1} (mod y)
51#
52# It can be shown that throughout all the iterations of the algorithm, the following holds:
53#    u = Ax + By
54#    v = Cx + Dy
55# The values B and D are not of interest in this case, so they need not be computed by the algorithm.
56# This means the following congruences hold through the iterations of the algorithm.
57#    Ax == u (mod y)
58#    Cx == v (mod y)
59
60# Now we will modify the notation to match that of BN_mod_inverse_odd()
61# on which beeu_mod_inverse_vartime() in `p256_beeu-x86_64-asm` is based.
62# In those functions:
63#    x, y -> a, n
64#    u, v -> B, A
65#    A, C -> X, Y’, where Y’ = -Y
66# Hence, the following holds throughout the algorithm iterations
67#    Xa == B (mod n)
68#   -Ya == A (mod n)
69#
70# Same algorithm in Python:
71# def beeu(a, n):
72#     X = 1
73#     Y = 0
74#     B = a
75#     A = n
76#     while (B != 0):
77#         while (B % 2) == 0:
78#             B >>= 1
79#             if (X % 2) == 1:
80#                 X = X + n
81#             X >>= 1
82#         while (A % 2) == 0:
83#             A >>= 1
84#             if (Y % 2) == 1:
85#                 Y = Y + n
86#             Y >>= 1
87#         if (B >= A):
88#             B = B - A
89#             X = X + Y
90#         else:
91#             A = A - B
92#             Y = Y + X
93#     if (A != 1):
94#         # error
95#         return 0
96#     else:
97#         while (Y > n):
98#             Y = Y - n
99#         Y = n - Y
100#         return Y
101
102
103# For the internal variables,
104# x0-x2, x30 are used to hold the modulus n. The input parameters passed in
105# x1,x2 are copied first before corrupting them. x0 (out) is stored on the stack.
106# x3-x7 are used for parameters, which is not the case in this function, so they are corruptible
107# x8 is corruptible here
108# (the function doesn't return a struct, hence x8 doesn't contain a passed-in address
109#  for that struct).
110# x9-x15 are corruptible registers
111# x19-x28 are callee-saved registers
112
113# X/Y will hold the inverse parameter
114# Assumption: a,n,X,Y < 2^(256)
115# Initially, X := 1, Y := 0
116#            A := n, B := a
117
118# Function parameters (as per the Procedure Call Standard)
119my($out, $a_in, $n_in)=map("x$_",(0..2));
120# Internal variables
121my($n0, $n1, $n2, $n3)=map("x$_",(0..2,30));
122my($x0, $x1, $x2, $x3, $x4)=map("x$_",(3..7));
123my($y0, $y1, $y2, $y3, $y4)=map("x$_",(8..12));
124my($shift)=("x13");
125my($t0, $t1, $t2, $t3)=map("x$_",(14,15,19,20));
126my($a0, $a1, $a2, $a3)=map("x$_",(21..24));
127my($b0, $b1, $b2, $b3)=map("x$_",(25..28));
128
129# if B == 0, jump to end of loop
130sub TEST_B_ZERO {
131  return <<___;
132    orr     $t0, $b0, $b1
133    orr     $t0, $t0, $b2
134
135    // reverse the bit order of $b0. This is needed for clz after this macro
136    rbit     $t1, $b0
137
138    orr     $t0, $t0, $b3
139    cbz     $t0,.Lbeeu_loop_end
140___
141}
142
143# Shift right by 1 bit, adding the modulus first if the variable is odd
144# if least_sig_bit(var0) == 0,
145#     goto shift1_<ctr>
146# else
147#     add n and goto shift1_<ctr>
148# Prerequisite: t0 = 0
149$g_next_label = 0;
150sub SHIFT1 {
151  my ($var0, $var1, $var2, $var3, $var4) = @_;
152  my $label = ".Lshift1_${g_next_label}";
153  $g_next_label++;
154  return <<___;
155    tbz     $var0, #0, $label
156    adds    $var0, $var0, $n0
157    adcs    $var1, $var1, $n1
158    adcs    $var2, $var2, $n2
159    adcs    $var3, $var3, $n3
160    adc     $var4, $var4, $t0
161$label:
162    // var0 := [var1|var0]<64..1>;
163    // i.e. concatenate var1 and var0,
164    //      extract bits <64..1> from the resulting 128-bit value
165    //      and put them in var0
166    extr    $var0, $var1, $var0, #1
167    extr    $var1, $var2, $var1, #1
168    extr    $var2, $var3, $var2, #1
169    extr    $var3, $var4, $var3, #1
170    lsr     $var4, $var4, #1
171___
172}
173
174# compilation by clang 10.0.0 with -O2/-O3 of
175#      a[0] = (a[0] >> count) | (a[1] << (64-count));
176#      a[1] = (a[1] >> count) | (a[2] << (64-count));
177#      a[2] = (a[2] >> count) | (a[3] << (64-count));
178#      a[3] >>= count;
179# Note: EXTR instruction used in SHIFT1 is similar to x86_64's SHRDQ
180# except that the second source operand of EXTR is only immediate;
181# that's why it cannot be used here where $shift is a variable
182#
183# In the following,
184# t0 := 0 - shift
185#
186# then var0, for example, will be shifted right as follows:
187# var0 := (var0 >> (uint(shift) mod 64)) | (var1 << (uint(t0) mod 64))
188# "uint() mod 64" is from the definition of LSL and LSR instructions.
189#
190# What matters here is the order of instructions relative to certain other
191# instructions, i.e.
192# - lsr and lsl must precede orr of the corresponding registers.
193# - lsl must preced the lsr of the same register afterwards.
194# The chosen order of the instructions overall is to try and maximize
195# the pipeline usage.
196sub SHIFT256 {
197  my ($var0, $var1, $var2, $var3) = @_;
198  return <<___;
199    neg $t0, $shift
200    lsr $var0, $var0, $shift
201    lsl $t1, $var1, $t0
202
203    lsr $var1, $var1, $shift
204    lsl $t2, $var2, $t0
205
206    orr $var0, $var0, $t1
207
208    lsr $var2, $var2, $shift
209    lsl $t3, $var3, $t0
210
211    orr $var1, $var1, $t2
212
213    lsr $var3, $var3, $shift
214
215    orr $var2, $var2, $t3
216___
217}
218
219$code.=<<___;
220#include "openssl/arm_arch.h"
221
222.text
223.globl  beeu_mod_inverse_vartime
224.type   beeu_mod_inverse_vartime, %function
225.align  4
226beeu_mod_inverse_vartime:
227    // Reserve enough space for 14 8-byte registers on the stack
228    // in the first stp call for x29, x30.
229    // Then store the remaining callee-saved registers.
230    //
231    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
232    //    ^                                                     ^
233    //    sp  <------------------- 112 bytes ----------------> old sp
234    //   x29 (FP)
235    //
236    AARCH64_SIGN_LINK_REGISTER
237    stp     x29,x30,[sp,#-112]!
238    add     x29,sp,#0
239    stp     x19,x20,[sp,#16]
240    stp     x21,x22,[sp,#32]
241    stp     x23,x24,[sp,#48]
242    stp     x25,x26,[sp,#64]
243    stp     x27,x28,[sp,#80]
244    stp     x0,x2,[sp,#96]
245
246    // B = b3..b0 := a
247    ldp     $b0,$b1,[$a_in]
248    ldp     $b2,$b3,[$a_in,#16]
249
250    // n3..n0 := n
251    // Note: the value of input params are changed in the following.
252    ldp     $n0,$n1,[$n_in]
253    ldp     $n2,$n3,[$n_in,#16]
254
255    // A = a3..a0 := n
256    mov     $a0, $n0
257    mov     $a1, $n1
258    mov     $a2, $n2
259    mov     $a3, $n3
260
261    // X = x4..x0 := 1
262    mov     $x0, #1
263    eor     $x1, $x1, $x1
264    eor     $x2, $x2, $x2
265    eor     $x3, $x3, $x3
266    eor     $x4, $x4, $x4
267
268    // Y = y4..y0 := 0
269    eor     $y0, $y0, $y0
270    eor     $y1, $y1, $y1
271    eor     $y2, $y2, $y2
272    eor     $y3, $y3, $y3
273    eor     $y4, $y4, $y4
274
275.Lbeeu_loop:
276    // if B == 0, jump to .Lbeeu_loop_end
277    ${\TEST_B_ZERO}
278
279    // 0 < B < |n|,
280    // 0 < A <= |n|,
281    // (1)      X*a  ==  B   (mod |n|),
282    // (2) (-1)*Y*a  ==  A   (mod |n|)
283
284    // Now divide B by the maximum possible power of two in the
285    // integers, and divide X by the same value mod |n|.
286    // When we're done, (1) still holds.
287
288    // shift := number of trailing 0s in $b0
289    // (      = number of leading 0s in $t1; see the "rbit" instruction in TEST_B_ZERO)
290    clz     $shift, $t1
291
292    // If there is no shift, goto shift_A_Y
293    cbz     $shift, .Lbeeu_shift_A_Y
294
295    // Shift B right by "$shift" bits
296    ${\SHIFT256($b0, $b1, $b2, $b3)}
297
298    // Shift X right by "$shift" bits, adding n whenever X becomes odd.
299    // $shift--;
300    // $t0 := 0; needed in the addition to the most significant word in SHIFT1
301    eor     $t0, $t0, $t0
302.Lbeeu_shift_loop_X:
303    ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
304    subs    $shift, $shift, #1
305    bne     .Lbeeu_shift_loop_X
306
307    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
308    // with the following differences:
309    // - "$shift" is set directly to the number of trailing 0s in B
310    //   (using rbit and clz instructions)
311    // - The loop is only used to call SHIFT1(X)
312    //   and $shift is decreased while executing the X loop.
313    // - SHIFT256(B, $shift) is performed before right-shifting X; they are independent
314
315.Lbeeu_shift_A_Y:
316    // Same for A and Y.
317    // Afterwards, (2) still holds.
318    // Reverse the bit order of $a0
319    // $shift := number of trailing 0s in $a0 (= number of leading 0s in $t1)
320    rbit    $t1, $a0
321    clz     $shift, $t1
322
323    // If there is no shift, goto |B-A|, X+Y update
324    cbz     $shift, .Lbeeu_update_B_X_or_A_Y
325
326    // Shift A right by "$shift" bits
327    ${\SHIFT256($a0, $a1, $a2, $a3)}
328
329    // Shift Y right by "$shift" bits, adding n whenever Y becomes odd.
330    // $shift--;
331    // $t0 := 0; needed in the addition to the most significant word in SHIFT1
332    eor     $t0, $t0, $t0
333.Lbeeu_shift_loop_Y:
334    ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
335    subs    $shift, $shift, #1
336    bne     .Lbeeu_shift_loop_Y
337
338.Lbeeu_update_B_X_or_A_Y:
339    // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
340    // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
341    //       without taking a sign bit if generated. The lack of a carry would
342    //       indicate a negative result. See, for example,
343    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
344    subs    $t0, $b0, $a0
345    sbcs    $t1, $b1, $a1
346    sbcs    $t2, $b2, $a2
347    sbcs    $t3, $b3, $a3
348    bcs     .Lbeeu_B_greater_than_A
349
350    // Else A > B =>
351    // A := A - B; Y := Y + X; goto beginning of the loop
352    subs    $a0, $a0, $b0
353    sbcs    $a1, $a1, $b1
354    sbcs    $a2, $a2, $b2
355    sbcs    $a3, $a3, $b3
356
357    adds    $y0, $y0, $x0
358    adcs    $y1, $y1, $x1
359    adcs    $y2, $y2, $x2
360    adcs    $y3, $y3, $x3
361    adc     $y4, $y4, $x4
362    b       .Lbeeu_loop
363
364.Lbeeu_B_greater_than_A:
365    // Continue with B > A =>
366    // B := B - A; X := X + Y; goto beginning of the loop
367    mov     $b0, $t0
368    mov     $b1, $t1
369    mov     $b2, $t2
370    mov     $b3, $t3
371
372    adds    $x0, $x0, $y0
373    adcs    $x1, $x1, $y1
374    adcs    $x2, $x2, $y2
375    adcs    $x3, $x3, $y3
376    adc     $x4, $x4, $y4
377    b       .Lbeeu_loop
378
379.Lbeeu_loop_end:
380    // The Euclid's algorithm loop ends when A == gcd(a,n);
381    // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
382    // Since (-1)*Y*a == A (mod |n|), Y>0
383    // then out = -Y mod n
384
385    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
386    // Is A-1 == 0?
387    // If not, fail.
388    sub     $t0, $a0, #1
389    orr     $t0, $t0, $a1
390    orr     $t0, $t0, $a2
391    orr     $t0, $t0, $a3
392    cbnz    $t0, .Lbeeu_err
393
394    // If Y>n ==> Y:=Y-n
395.Lbeeu_reduction_loop:
396    // x_i := y_i - n_i (X is no longer needed, use it as temp)
397    // ($t0 = 0 from above)
398    subs    $x0, $y0, $n0
399    sbcs    $x1, $y1, $n1
400    sbcs    $x2, $y2, $n2
401    sbcs    $x3, $y3, $n3
402    sbcs    $x4, $y4, $t0
403
404    // If result is non-negative (i.e., cs = carry set = no borrow),
405    // y_i := x_i; goto reduce again
406    // else
407    // y_i := y_i; continue
408    csel    $y0, $x0, $y0, cs
409    csel    $y1, $x1, $y1, cs
410    csel    $y2, $x2, $y2, cs
411    csel    $y3, $x3, $y3, cs
412    csel    $y4, $x4, $y4, cs
413    bcs     .Lbeeu_reduction_loop
414
415    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
416    // out = -Y = n-Y
417    subs    $y0, $n0, $y0
418    sbcs    $y1, $n1, $y1
419    sbcs    $y2, $n2, $y2
420    sbcs    $y3, $n3, $y3
421
422    // Save Y in output (out (x0) was saved on the stack)
423    ldr     x3, [sp,#96]
424    stp     $y0, $y1, [x3]
425    stp     $y2, $y3, [x3,#16]
426    // return 1 (success)
427    mov     x0, #1
428    b       .Lbeeu_finish
429
430.Lbeeu_err:
431    // return 0 (error)
432    eor     x0, x0, x0
433
434.Lbeeu_finish:
435    // Restore callee-saved registers, except x0, x2
436    add     sp,x29,#0
437    ldp     x19,x20,[sp,#16]
438    ldp     x21,x22,[sp,#32]
439    ldp     x23,x24,[sp,#48]
440    ldp     x25,x26,[sp,#64]
441    ldp     x27,x28,[sp,#80]
442    ldp     x29,x30,[sp],#112
443
444    AARCH64_VALIDATE_LINK_REGISTER
445    ret
446.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
447___
448
449
450foreach (split("\n",$code)) {
451    s/\`([^\`]*)\`/eval $1/ge;
452
453    print $_,"\n";
454}
455close STDOUT or die "error closing STDOUT: $!"; # enforce flush
456