• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include "openssl/arm_arch.h"
17
18.text
19.globl	beeu_mod_inverse_vartime
20.hidden	beeu_mod_inverse_vartime
21.type	beeu_mod_inverse_vartime, %function
22.align	4
23beeu_mod_inverse_vartime:
24    // Reserve enough space for 14 8-byte registers on the stack
25    // in the first stp call for x29, x30.
26    // Then store the remaining callee-saved registers.
27    //
28    //    | x29 | x30 | x19 | x20 | ... | x27 | x28 |  x0 |  x2 |
29    //    ^                                                     ^
30    //    sp  <------------------- 112 bytes ----------------> old sp
31    //   x29 (FP)
32    //
33	AARCH64_SIGN_LINK_REGISTER
34	stp	x29,x30,[sp,#-112]!
35	add	x29,sp,#0
36	stp	x19,x20,[sp,#16]
37	stp	x21,x22,[sp,#32]
38	stp	x23,x24,[sp,#48]
39	stp	x25,x26,[sp,#64]
40	stp	x27,x28,[sp,#80]
41	stp	x0,x2,[sp,#96]
42
43    // B = b3..b0 := a
44	ldp	x25,x26,[x1]
45	ldp	x27,x28,[x1,#16]
46
47    // n3..n0 := n
48    // Note: the value of input params are changed in the following.
49	ldp	x0,x1,[x2]
50	ldp	x2,x30,[x2,#16]
51
52    // A = a3..a0 := n
53	mov	x21, x0
54	mov	x22, x1
55	mov	x23, x2
56	mov	x24, x30
57
58    // X = x4..x0 := 1
59	mov	x3, #1
60	eor	x4, x4, x4
61	eor	x5, x5, x5
62	eor	x6, x6, x6
63	eor	x7, x7, x7
64
65    // Y = y4..y0 := 0
66	eor	x8, x8, x8
67	eor	x9, x9, x9
68	eor	x10, x10, x10
69	eor	x11, x11, x11
70	eor	x12, x12, x12
71
72.Lbeeu_loop:
73    // if B == 0, jump to .Lbeeu_loop_end
74	orr	x14, x25, x26
75	orr	x14, x14, x27
76
77    // reverse the bit order of x25. This is needed for clz after this macro
78	rbit	x15, x25
79
80	orr	x14, x14, x28
81	cbz	x14,.Lbeeu_loop_end
82
83
84    // 0 < B < |n|,
85    // 0 < A <= |n|,
86    // (1)      X*a  ==  B   (mod |n|),
87    // (2) (-1)*Y*a  ==  A   (mod |n|)
88
89    // Now divide B by the maximum possible power of two in the
90    // integers, and divide X by the same value mod |n|.
91    // When we're done, (1) still holds.
92
93    // shift := number of trailing 0s in x25
94    // (      = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
95	clz	x13, x15
96
97    // If there is no shift, goto shift_A_Y
98	cbz	x13, .Lbeeu_shift_A_Y
99
100    // Shift B right by "x13" bits
101	neg	x14, x13
102	lsr	x25, x25, x13
103	lsl	x15, x26, x14
104
105	lsr	x26, x26, x13
106	lsl	x19, x27, x14
107
108	orr	x25, x25, x15
109
110	lsr	x27, x27, x13
111	lsl	x20, x28, x14
112
113	orr	x26, x26, x19
114
115	lsr	x28, x28, x13
116
117	orr	x27, x27, x20
118
119
120    // Shift X right by "x13" bits, adding n whenever X becomes odd.
121    // x13--;
122    // x14 := 0; needed in the addition to the most significant word in SHIFT1
123	eor	x14, x14, x14
124.Lbeeu_shift_loop_X:
125	tbz	x3, #0, .Lshift1_0
126	adds	x3, x3, x0
127	adcs	x4, x4, x1
128	adcs	x5, x5, x2
129	adcs	x6, x6, x30
130	adc	x7, x7, x14
131.Lshift1_0:
132    // var0 := [var1|var0]<64..1>;
133    // i.e. concatenate var1 and var0,
134    //      extract bits <64..1> from the resulting 128-bit value
135    //      and put them in var0
136	extr	x3, x4, x3, #1
137	extr	x4, x5, x4, #1
138	extr	x5, x6, x5, #1
139	extr	x6, x7, x6, #1
140	lsr	x7, x7, #1
141
142	subs	x13, x13, #1
143	bne	.Lbeeu_shift_loop_X
144
145    // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
146    // with the following differences:
147    // - "x13" is set directly to the number of trailing 0s in B
148    //   (using rbit and clz instructions)
149    // - The loop is only used to call SHIFT1(X)
150    //   and x13 is decreased while executing the X loop.
151    // - SHIFT256(B, x13) is performed before right-shifting X; they are independent
152
153.Lbeeu_shift_A_Y:
154    // Same for A and Y.
155    // Afterwards, (2) still holds.
156    // Reverse the bit order of x21
157    // x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
158	rbit	x15, x21
159	clz	x13, x15
160
161    // If there is no shift, goto |B-A|, X+Y update
162	cbz	x13, .Lbeeu_update_B_X_or_A_Y
163
164    // Shift A right by "x13" bits
165	neg	x14, x13
166	lsr	x21, x21, x13
167	lsl	x15, x22, x14
168
169	lsr	x22, x22, x13
170	lsl	x19, x23, x14
171
172	orr	x21, x21, x15
173
174	lsr	x23, x23, x13
175	lsl	x20, x24, x14
176
177	orr	x22, x22, x19
178
179	lsr	x24, x24, x13
180
181	orr	x23, x23, x20
182
183
184    // Shift Y right by "x13" bits, adding n whenever Y becomes odd.
185    // x13--;
186    // x14 := 0; needed in the addition to the most significant word in SHIFT1
187	eor	x14, x14, x14
188.Lbeeu_shift_loop_Y:
189	tbz	x8, #0, .Lshift1_1
190	adds	x8, x8, x0
191	adcs	x9, x9, x1
192	adcs	x10, x10, x2
193	adcs	x11, x11, x30
194	adc	x12, x12, x14
195.Lshift1_1:
196    // var0 := [var1|var0]<64..1>;
197    // i.e. concatenate var1 and var0,
198    //      extract bits <64..1> from the resulting 128-bit value
199    //      and put them in var0
200	extr	x8, x9, x8, #1
201	extr	x9, x10, x9, #1
202	extr	x10, x11, x10, #1
203	extr	x11, x12, x11, #1
204	lsr	x12, x12, #1
205
206	subs	x13, x13, #1
207	bne	.Lbeeu_shift_loop_Y
208
209.Lbeeu_update_B_X_or_A_Y:
210    // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
211    // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
212    //       without taking a sign bit if generated. The lack of a carry would
213    //       indicate a negative result. See, for example,
214    //       https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
215	subs	x14, x25, x21
216	sbcs	x15, x26, x22
217	sbcs	x19, x27, x23
218	sbcs	x20, x28, x24
219	bcs	.Lbeeu_B_greater_than_A
220
221    // Else A > B =>
222    // A := A - B; Y := Y + X; goto beginning of the loop
223	subs	x21, x21, x25
224	sbcs	x22, x22, x26
225	sbcs	x23, x23, x27
226	sbcs	x24, x24, x28
227
228	adds	x8, x8, x3
229	adcs	x9, x9, x4
230	adcs	x10, x10, x5
231	adcs	x11, x11, x6
232	adc	x12, x12, x7
233	b	.Lbeeu_loop
234
235.Lbeeu_B_greater_than_A:
236    // Continue with B > A =>
237    // B := B - A; X := X + Y; goto beginning of the loop
238	mov	x25, x14
239	mov	x26, x15
240	mov	x27, x19
241	mov	x28, x20
242
243	adds	x3, x3, x8
244	adcs	x4, x4, x9
245	adcs	x5, x5, x10
246	adcs	x6, x6, x11
247	adc	x7, x7, x12
248	b	.Lbeeu_loop
249
250.Lbeeu_loop_end:
251    // The Euclid's algorithm loop ends when A == gcd(a,n);
252    // this would be 1, when a and n are co-prime (i.e. do not have a common factor).
253    // Since (-1)*Y*a == A (mod |n|), Y>0
254    // then out = -Y mod n
255
256    // Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
257    // Is A-1 == 0?
258    // If not, fail.
259	sub	x14, x21, #1
260	orr	x14, x14, x22
261	orr	x14, x14, x23
262	orr	x14, x14, x24
263	cbnz	x14, .Lbeeu_err
264
265    // If Y>n ==> Y:=Y-n
266.Lbeeu_reduction_loop:
267    // x_i := y_i - n_i (X is no longer needed, use it as temp)
268    // (x14 = 0 from above)
269	subs	x3, x8, x0
270	sbcs	x4, x9, x1
271	sbcs	x5, x10, x2
272	sbcs	x6, x11, x30
273	sbcs	x7, x12, x14
274
275    // If result is non-negative (i.e., cs = carry set = no borrow),
276    // y_i := x_i; goto reduce again
277    // else
278    // y_i := y_i; continue
279	csel	x8, x3, x8, cs
280	csel	x9, x4, x9, cs
281	csel	x10, x5, x10, cs
282	csel	x11, x6, x11, cs
283	csel	x12, x7, x12, cs
284	bcs	.Lbeeu_reduction_loop
285
286    // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
287    // out = -Y = n-Y
288	subs	x8, x0, x8
289	sbcs	x9, x1, x9
290	sbcs	x10, x2, x10
291	sbcs	x11, x30, x11
292
293    // Save Y in output (out (x0) was saved on the stack)
294	ldr	x3, [sp,#96]
295	stp	x8, x9, [x3]
296	stp	x10, x11, [x3,#16]
297    // return 1 (success)
298	mov	x0, #1
299	b	.Lbeeu_finish
300
301.Lbeeu_err:
302    // return 0 (error)
303	eor	x0, x0, x0
304
305.Lbeeu_finish:
306    // Restore callee-saved registers, except x0, x2
307	add	sp,x29,#0
308	ldp	x19,x20,[sp,#16]
309	ldp	x21,x22,[sp,#32]
310	ldp	x23,x24,[sp,#48]
311	ldp	x25,x26,[sp,#64]
312	ldp	x27,x28,[sp,#80]
313	ldp	x29,x30,[sp],#112
314
315	AARCH64_VALIDATE_LINK_REGISTER
316	ret
317.size	beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
318#endif
319#endif  // !OPENSSL_NO_ASM
320.section	.note.GNU-stack,"",%progbits
321