• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <openssl/bn.h>
2 
3 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)
4 
5 #include "../internal.h"
6 
7 /* x86_64 BIGNUM accelerator version 0.1, December 2002.
8  *
9  * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10  * project.
11  *
12  * Rights for redistribution and usage in source and binary forms are
13  * granted according to the OpenSSL license. Warranty of any kind is
14  * disclaimed.
15  *
16  * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
17  *    versions, like 1.0...
18  * A. Well, that's because this code is basically a quick-n-dirty
19  *    proof-of-concept hack. As you can see it's implemented with
20  *    inline assembler, which means that you're bound to GCC and that
21  *    there might be enough room for further improvement.
22  *
23  * Q. Why inline assembler?
24  * A. x86_64 features own ABI which I'm not familiar with. This is
25  *    why I decided to let the compiler take care of subroutine
26  *    prologue/epilogue as well as register allocation. For reference.
27  *    Win64 implements different ABI for AMD64, different from Linux.
28  *
29  * Q. How much faster does it get?
30  * A. 'apps/openssl speed rsa dsa' output with no-asm:
31  *
32  *	                  sign    verify    sign/s verify/s
33  *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
34  *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
35  *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
36  *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
37  *	                  sign    verify    sign/s verify/s
38  *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
39  *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
40  *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
41  *
42  *    'apps/openssl speed rsa dsa' output with this module:
43  *
44  *	                  sign    verify    sign/s verify/s
45  *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
46  *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
47  *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
48  *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
49  *	                  sign    verify    sign/s verify/s
50  *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
51  *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
52  *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
53  *
54  *    For the reference. IA-32 assembler implementation performs
55  *    very much like 64-bit code compiled with no-asm on the same
56  *    machine.
57  */
58 
59  /* TODO(davidben): Get this file working on Windows x64. */
60 
#undef mul
#undef mul_add

#define asm __asm__

/*
 * mul_add(r, a, word, carry): r = low word of (r + a * word + carry), and
 * |carry| is replaced by the high word of that sum.
 *
 * Operand constraints:
 *   "m"(a), "+m"(r)  keep both operands in memory, which is the way to favor
 *                    DirectPath micro-code;
 *   "g"(0)           lets the compiler decide where it wants to keep the
 *                    zero that is added together with the carry flag.
 */
#define mul_add(r, a, word, carry)                                  \
  do {                                                              \
    register BN_ULONG hi, lo;                                       \
    /* hi:lo = a * word */                                          \
    asm("mulq %3" : "=a"(lo), "=d"(hi) : "a"(word), "m"(a) : "cc"); \
    /* carry += lo; the carry-out is folded into hi */              \
    asm("addq %2,%0; adcq %3,%1"                                    \
        : "+r"(carry), "+d"(hi)                                     \
        : "a"(lo), "g"(0)                                           \
        : "cc");                                                    \
    /* r += carry; the carry-out is folded into hi again */         \
    asm("addq %2,%0; adcq %3,%1"                                    \
        : "+m"(r), "+d"(hi)                                         \
        : "r"(carry), "g"(0)                                        \
        : "cc");                                                    \
    carry = hi;                                                     \
  } while (0)
85 
/*
 * mul(r, a, word, carry): r = low word of (a * word + carry), and |carry| is
 * replaced by the high word of that sum.
 *
 * The multiplicand is constrained to "rm" rather than "g": "g" also admits an
 * immediate operand, which mulq cannot encode.
 */
#define mul(r, a, word, carry)                                          \
  do {                                                                  \
    register BN_ULONG high, low;                                        \
    /* high:low = a * word */                                           \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "rm"(a) : "cc"); \
    /* carry += low; the carry-out is folded into high */               \
    asm("addq %2,%0; adcq %3,%1"                                        \
        : "+r"(carry), "+d"(high)                                       \
        : "a"(low), "g"(0)                                              \
        : "cc");                                                        \
    (r) = carry;                                                        \
    carry = high;                                                       \
  } while (0)
#undef sqr
/*
 * sqr(r0, r1, a): r1:r0 = a * a, with r0 receiving the low word and r1 the
 * high word. The expansion carries no trailing semicolon, so the macro can be
 * used like an ordinary statement (including in an unbraced if/else).
 */
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc")
98 
/*
 * bn_mul_add_words computes rp[i] += ap[i] * w for each of the |num| words,
 * propagating the carry from one word to the next, and returns the carry out
 * of the last word. It returns zero without touching |rp| when |num| is not
 * positive.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG carry = 0;

  if (num <= 0) {
    return carry;
  }

  /* Bulk of the work: four words per iteration. */
  for (; num >= 4; num -= 4, ap += 4, rp += 4) {
    mul_add(rp[0], ap[0], w, carry);
    mul_add(rp[1], ap[1], w, carry);
    mul_add(rp[2], ap[2], w, carry);
    mul_add(rp[3], ap[3], w, carry);
  }

  /* Tail: the remaining zero to three words, lowest word first. */
  for (; num > 0; num--, ap++, rp++) {
    mul_add(rp[0], ap[0], w, carry);
  }

  return carry;
}
131 
/*
 * bn_mul_words computes rp[i] = ap[i] * w for each of the |num| words,
 * propagating the carry from one word to the next, and returns the carry out
 * of the last word. It returns zero without touching |rp| when |num| is not
 * positive.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;

  if (num <= 0) {
    return carry;
  }

  /* Bulk of the work: four words per iteration. */
  for (; num >= 4; num -= 4, ap += 4, rp += 4) {
    mul(rp[0], ap[0], w, carry);
    mul(rp[1], ap[1], w, carry);
    mul(rp[2], ap[2], w, carry);
    mul(rp[3], ap[3], w, carry);
  }

  /* Tail: the remaining zero to three words, lowest word first. */
  for (; num > 0; num--, ap++, rp++) {
    mul(rp[0], ap[0], w, carry);
  }

  return carry;
}
161 
/*
 * bn_sqr_words squares each of the |n| words of |a| independently: the
 * two-word square of a[i] is stored in r[2*i] (low word) and r[2*i + 1] (high
 * word). Nothing is written when |n| is not positive.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  if (n <= 0) {
    return;
  }

  /* Bulk of the work: four input words (eight output words) per iteration. */
  for (; n >= 4; n -= 4, a += 4, r += 8) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
  }

  /* Tail: the remaining zero to three input words. */
  for (; n > 0; n--, a++, r += 2) {
    sqr(r[0], r[1], a[0]);
  }
}
188 
/*
 * bn_div_words returns the one-word quotient of the two-word value h:l
 * (|h| is the high word, |l| the low word) divided by |d|. The remainder is
 * computed by the instruction but discarded.
 *
 * divq raises a divide error when the divisor is zero or the quotient does
 * not fit in one word, so callers must guarantee d != 0 and h < d.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG quotient, remainder;

  /* The divisor is constrained to "r": divq has no immediate form, so the
   * looser "g" constraint could let the compiler emit an instruction that
   * cannot be assembled when |d| is a known constant. */
  asm("divq %4"
      : "=a"(quotient), "=d"(remainder)
      : "a"(l), "d"(h), "r"(d)
      : "cc");

  (void)remainder;
  return quotient;
}
196 
/*
 * bn_add_words computes rp[i] = ap[i] + bp[i] for each of the |n| words,
 * propagating the carry from one word to the next, and returns the carry out
 * of the last word (0 or 1). It returns zero without touching |rp| when |n|
 * is not positive.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;
  size_t count;

  if (n <= 0) {
    return 0;
  }

  /* |loop| decrements and tests the full 64-bit %rcx. Binding the 32-bit |n|
   * to "+c" only defines the low half of that register, so widen the count
   * first; otherwise stale upper bits could make the loop overrun. */
  count = (size_t)n;

  asm volatile (
      "  subq  %0,%0         \n" /* ret = 0 and clear carry */
      "  jmp   1f            \n" /* skip the alignment padding */
      ".p2align 4            \n"
      "1:  movq  (%4,%2,8),%0  \n" /* ret = ap[i] */
      "  adcq  (%5,%2,8),%0  \n" /* ret += bp[i] + carry */
      "  movq  %0,(%3,%2,8)  \n" /* rp[i] = ret */
      "  lea   1(%2),%2      \n" /* i++ without touching the carry flag */
      "  loop  1b            \n" /* --count; repeat while non-zero */
      "  sbbq  %0,%0         \n" /* ret = carry ? all-ones : 0 */
      : "=&r"(ret), "+c"(count), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
222 
223 #ifndef SIMICS
/*
 * bn_sub_words computes rp[i] = ap[i] - bp[i] for each of the |n| words,
 * propagating the borrow from one word to the next, and returns the borrow
 * out of the last word (0 or 1). It returns zero without touching |rp| when
 * |n| is not positive.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;
  size_t count;

  if (n <= 0) {
    return 0;
  }

  /* |loop| decrements and tests the full 64-bit %rcx. Binding the 32-bit |n|
   * to "+c" only defines the low half of that register, so widen the count
   * first; otherwise stale upper bits could make the loop overrun. */
  count = (size_t)n;

  asm volatile (
      "  subq  %0,%0         \n" /* ret = 0 and clear borrow */
      "  jmp   1f            \n" /* skip the alignment padding */
      ".p2align 4            \n"
      "1:  movq  (%4,%2,8),%0  \n" /* ret = ap[i] */
      "  sbbq  (%5,%2,8),%0  \n" /* ret -= bp[i] + borrow */
      "  movq  %0,(%3,%2,8)  \n" /* rp[i] = ret */
      "  lea   1(%2),%2      \n" /* i++ without touching the carry flag */
      "  loop  1b            \n" /* --count; repeat while non-zero */
      "  sbbq  %0,%0         \n" /* ret = borrow ? all-ones : 0 */
      : "=&r"(ret), "+c"(count), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
249 #else
250 /* Simics 1.4<7 has buggy sbbq:-( */
251 #define BN_MASK2 0xffffffffffffffffL
bn_sub_words(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b,int n)252 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
253   BN_ULONG t1, t2;
254   int c = 0;
255 
256   if (n <= 0) {
257     return (BN_ULONG)0;
258   }
259 
260   for (;;) {
261     t1 = a[0];
262     t2 = b[0];
263     r[0] = (t1 - t2 - c) & BN_MASK2;
264     if (t1 != t2) {
265       c = (t1 < t2);
266     }
267     if (--n <= 0) {
268       break;
269     }
270 
271     t1 = a[1];
272     t2 = b[1];
273     r[1] = (t1 - t2 - c) & BN_MASK2;
274     if (t1 != t2) {
275       c = (t1 < t2);
276     }
277     if (--n <= 0) {
278       break;
279     }
280 
281     t1 = a[2];
282     t2 = b[2];
283     r[2] = (t1 - t2 - c) & BN_MASK2;
284     if (t1 != t2) {
285       c = (t1 < t2);
286     }
287     if (--n <= 0) {
288       break;
289     }
290 
291     t1 = a[3];
292     t2 = b[3];
293     r[3] = (t1 - t2 - c) & BN_MASK2;
294     if (t1 != t2) {
295       c = (t1 < t2);
296     }
297     if (--n <= 0) {
298       break;
299     }
300 
301     a += 4;
302     b += 4;
303     r += 4;
304   }
305   return c;
306 }
307 #endif
308 
/*
 * Column accumulators used by the comba routines. In every macro
 * c = (c2, c1, c0) is a three-word number, c0 being the least significant
 * word:
 *
 *   mul_add_c(a, b, c0, c1, c2)       c += a * b
 *   mul_add_c2(a, b, c0, c1, c2)      c += 2 * a * b
 *   sqr_add_c(a, i, c0, c1, c2)       c += a[i]^2
 *   sqr_add_c2(a, i, j, c0, c1, c2)   c += 2 * a[i] * a[j]
 *
 * Each two-word product t2:t1 is added with an addq/adcq/adcq chain, so a
 * carry out of c1 lands in c2. The second multiplicand of mul_add_c and
 * mul_add_c2 is bound with an "m" constraint and must therefore be an lvalue.
 */
#define mul_add_c(a, b, c0, c1, c2)                              \
  do {                                                           \
    BN_ULONG t1, t2;                                             \
    /* t2:t1 = a * b */                                          \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
        : "+r"(c0), "+r"(c1), "+r"(c2)                           \
        : "r"(t1), "r"(t2), "g"(0)                               \
        : "cc");                                                 \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2)                           \
  do {                                                        \
    BN_ULONG t1, t2;                                          \
    /* t2:t1 = a[i] * a[i]; %2 is the same register as %rax */ \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                  \
        : "+r"(c0), "+r"(c1), "+r"(c2)                        \
        : "r"(t1), "r"(t2), "g"(0)                            \
        : "cc");                                              \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2)                             \
  do {                                                           \
    BN_ULONG t1, t2;                                             \
    /* t2:t1 = a * b */                                          \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    /* The product is added twice rather than doubled first. */  \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
        : "+r"(c0), "+r"(c1), "+r"(c2)                           \
        : "r"(t1), "r"(t2), "g"(0)                               \
        : "cc");                                                 \
    asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                     \
        : "+r"(c0), "+r"(c1), "+r"(c2)                           \
        : "r"(t1), "r"(t2), "g"(0)                               \
        : "cc");                                                 \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
352 
/*
 * bn_mul_comba8 writes the 16-word product of the 8-word numbers |a| and |b|
 * to |r|, one output column at a time. Column k sums every a[i] * b[j] with
 * i + j == k into a three-word accumulator; the three accumulator variables
 * rotate roles (low, middle, high) from one column to the next.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

  /* column 0 */
  mul_add_c(a[0], b[0], acc0, acc1, acc2);
  r[0] = acc0;
  acc0 = 0;

  /* column 1 */
  mul_add_c(a[0], b[1], acc1, acc2, acc0);
  mul_add_c(a[1], b[0], acc1, acc2, acc0);
  r[1] = acc1;
  acc1 = 0;

  /* column 2 */
  mul_add_c(a[0], b[2], acc2, acc0, acc1);
  mul_add_c(a[1], b[1], acc2, acc0, acc1);
  mul_add_c(a[2], b[0], acc2, acc0, acc1);
  r[2] = acc2;
  acc2 = 0;

  /* column 3 */
  mul_add_c(a[0], b[3], acc0, acc1, acc2);
  mul_add_c(a[1], b[2], acc0, acc1, acc2);
  mul_add_c(a[2], b[1], acc0, acc1, acc2);
  mul_add_c(a[3], b[0], acc0, acc1, acc2);
  r[3] = acc0;
  acc0 = 0;

  /* column 4 */
  mul_add_c(a[0], b[4], acc1, acc2, acc0);
  mul_add_c(a[1], b[3], acc1, acc2, acc0);
  mul_add_c(a[2], b[2], acc1, acc2, acc0);
  mul_add_c(a[3], b[1], acc1, acc2, acc0);
  mul_add_c(a[4], b[0], acc1, acc2, acc0);
  r[4] = acc1;
  acc1 = 0;

  /* column 5 */
  mul_add_c(a[0], b[5], acc2, acc0, acc1);
  mul_add_c(a[1], b[4], acc2, acc0, acc1);
  mul_add_c(a[2], b[3], acc2, acc0, acc1);
  mul_add_c(a[3], b[2], acc2, acc0, acc1);
  mul_add_c(a[4], b[1], acc2, acc0, acc1);
  mul_add_c(a[5], b[0], acc2, acc0, acc1);
  r[5] = acc2;
  acc2 = 0;

  /* column 6 */
  mul_add_c(a[0], b[6], acc0, acc1, acc2);
  mul_add_c(a[1], b[5], acc0, acc1, acc2);
  mul_add_c(a[2], b[4], acc0, acc1, acc2);
  mul_add_c(a[3], b[3], acc0, acc1, acc2);
  mul_add_c(a[4], b[2], acc0, acc1, acc2);
  mul_add_c(a[5], b[1], acc0, acc1, acc2);
  mul_add_c(a[6], b[0], acc0, acc1, acc2);
  r[6] = acc0;
  acc0 = 0;

  /* column 7 */
  mul_add_c(a[0], b[7], acc1, acc2, acc0);
  mul_add_c(a[1], b[6], acc1, acc2, acc0);
  mul_add_c(a[2], b[5], acc1, acc2, acc0);
  mul_add_c(a[3], b[4], acc1, acc2, acc0);
  mul_add_c(a[4], b[3], acc1, acc2, acc0);
  mul_add_c(a[5], b[2], acc1, acc2, acc0);
  mul_add_c(a[6], b[1], acc1, acc2, acc0);
  mul_add_c(a[7], b[0], acc1, acc2, acc0);
  r[7] = acc1;
  acc1 = 0;

  /* column 8 */
  mul_add_c(a[1], b[7], acc2, acc0, acc1);
  mul_add_c(a[2], b[6], acc2, acc0, acc1);
  mul_add_c(a[3], b[5], acc2, acc0, acc1);
  mul_add_c(a[4], b[4], acc2, acc0, acc1);
  mul_add_c(a[5], b[3], acc2, acc0, acc1);
  mul_add_c(a[6], b[2], acc2, acc0, acc1);
  mul_add_c(a[7], b[1], acc2, acc0, acc1);
  r[8] = acc2;
  acc2 = 0;

  /* column 9 */
  mul_add_c(a[2], b[7], acc0, acc1, acc2);
  mul_add_c(a[3], b[6], acc0, acc1, acc2);
  mul_add_c(a[4], b[5], acc0, acc1, acc2);
  mul_add_c(a[5], b[4], acc0, acc1, acc2);
  mul_add_c(a[6], b[3], acc0, acc1, acc2);
  mul_add_c(a[7], b[2], acc0, acc1, acc2);
  r[9] = acc0;
  acc0 = 0;

  /* column 10 */
  mul_add_c(a[3], b[7], acc1, acc2, acc0);
  mul_add_c(a[4], b[6], acc1, acc2, acc0);
  mul_add_c(a[5], b[5], acc1, acc2, acc0);
  mul_add_c(a[6], b[4], acc1, acc2, acc0);
  mul_add_c(a[7], b[3], acc1, acc2, acc0);
  r[10] = acc1;
  acc1 = 0;

  /* column 11 */
  mul_add_c(a[4], b[7], acc2, acc0, acc1);
  mul_add_c(a[5], b[6], acc2, acc0, acc1);
  mul_add_c(a[6], b[5], acc2, acc0, acc1);
  mul_add_c(a[7], b[4], acc2, acc0, acc1);
  r[11] = acc2;
  acc2 = 0;

  /* column 12 */
  mul_add_c(a[5], b[7], acc0, acc1, acc2);
  mul_add_c(a[6], b[6], acc0, acc1, acc2);
  mul_add_c(a[7], b[5], acc0, acc1, acc2);
  r[12] = acc0;
  acc0 = 0;

  /* column 13 */
  mul_add_c(a[6], b[7], acc1, acc2, acc0);
  mul_add_c(a[7], b[6], acc1, acc2, acc0);
  r[13] = acc1;
  acc1 = 0;

  /* column 14, whose middle word is the top word of the product */
  mul_add_c(a[7], b[7], acc2, acc0, acc1);
  r[14] = acc2;
  r[15] = acc0;
}
454 
/*
 * bn_mul_comba4 writes the 8-word product of the 4-word numbers |a| and |b|
 * to |r|, one output column at a time. Column k sums every a[i] * b[j] with
 * i + j == k into a three-word accumulator; the three accumulator variables
 * rotate roles (low, middle, high) from one column to the next.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

  /* column 0 */
  mul_add_c(a[0], b[0], acc0, acc1, acc2);
  r[0] = acc0;
  acc0 = 0;

  /* column 1 */
  mul_add_c(a[0], b[1], acc1, acc2, acc0);
  mul_add_c(a[1], b[0], acc1, acc2, acc0);
  r[1] = acc1;
  acc1 = 0;

  /* column 2 */
  mul_add_c(a[0], b[2], acc2, acc0, acc1);
  mul_add_c(a[1], b[1], acc2, acc0, acc1);
  mul_add_c(a[2], b[0], acc2, acc0, acc1);
  r[2] = acc2;
  acc2 = 0;

  /* column 3 */
  mul_add_c(a[0], b[3], acc0, acc1, acc2);
  mul_add_c(a[1], b[2], acc0, acc1, acc2);
  mul_add_c(a[2], b[1], acc0, acc1, acc2);
  mul_add_c(a[3], b[0], acc0, acc1, acc2);
  r[3] = acc0;
  acc0 = 0;

  /* column 4 */
  mul_add_c(a[1], b[3], acc1, acc2, acc0);
  mul_add_c(a[2], b[2], acc1, acc2, acc0);
  mul_add_c(a[3], b[1], acc1, acc2, acc0);
  r[4] = acc1;
  acc1 = 0;

  /* column 5 */
  mul_add_c(a[2], b[3], acc2, acc0, acc1);
  mul_add_c(a[3], b[2], acc2, acc0, acc1);
  r[5] = acc2;
  acc2 = 0;

  /* column 6, whose middle word is the top word of the product */
  mul_add_c(a[3], b[3], acc0, acc1, acc2);
  r[6] = acc0;
  r[7] = acc1;
}
492 
/*
 * bn_sqr_comba8 writes the 16-word square of the 8-word number |a| to |r|,
 * one output column at a time. Column k takes the square a[k/2]^2 when k is
 * even, plus twice each cross product a[i] * a[j] with i > j and i + j == k.
 * The three accumulator variables rotate roles (low, middle, high) from one
 * column to the next.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

  /* column 0 */
  sqr_add_c(a, 0, acc0, acc1, acc2);
  r[0] = acc0;
  acc0 = 0;

  /* column 1 */
  sqr_add_c2(a, 1, 0, acc1, acc2, acc0);
  r[1] = acc1;
  acc1 = 0;

  /* column 2 */
  sqr_add_c(a, 1, acc2, acc0, acc1);
  sqr_add_c2(a, 2, 0, acc2, acc0, acc1);
  r[2] = acc2;
  acc2 = 0;

  /* column 3 */
  sqr_add_c2(a, 3, 0, acc0, acc1, acc2);
  sqr_add_c2(a, 2, 1, acc0, acc1, acc2);
  r[3] = acc0;
  acc0 = 0;

  /* column 4 */
  sqr_add_c(a, 2, acc1, acc2, acc0);
  sqr_add_c2(a, 4, 0, acc1, acc2, acc0);
  sqr_add_c2(a, 3, 1, acc1, acc2, acc0);
  r[4] = acc1;
  acc1 = 0;

  /* column 5 */
  sqr_add_c2(a, 5, 0, acc2, acc0, acc1);
  sqr_add_c2(a, 4, 1, acc2, acc0, acc1);
  sqr_add_c2(a, 3, 2, acc2, acc0, acc1);
  r[5] = acc2;
  acc2 = 0;

  /* column 6 */
  sqr_add_c(a, 3, acc0, acc1, acc2);
  sqr_add_c2(a, 6, 0, acc0, acc1, acc2);
  sqr_add_c2(a, 5, 1, acc0, acc1, acc2);
  sqr_add_c2(a, 4, 2, acc0, acc1, acc2);
  r[6] = acc0;
  acc0 = 0;

  /* column 7 */
  sqr_add_c2(a, 7, 0, acc1, acc2, acc0);
  sqr_add_c2(a, 6, 1, acc1, acc2, acc0);
  sqr_add_c2(a, 5, 2, acc1, acc2, acc0);
  sqr_add_c2(a, 4, 3, acc1, acc2, acc0);
  r[7] = acc1;
  acc1 = 0;

  /* column 8 */
  sqr_add_c(a, 4, acc2, acc0, acc1);
  sqr_add_c2(a, 7, 1, acc2, acc0, acc1);
  sqr_add_c2(a, 6, 2, acc2, acc0, acc1);
  sqr_add_c2(a, 5, 3, acc2, acc0, acc1);
  r[8] = acc2;
  acc2 = 0;

  /* column 9 */
  sqr_add_c2(a, 7, 2, acc0, acc1, acc2);
  sqr_add_c2(a, 6, 3, acc0, acc1, acc2);
  sqr_add_c2(a, 5, 4, acc0, acc1, acc2);
  r[9] = acc0;
  acc0 = 0;

  /* column 10 */
  sqr_add_c(a, 5, acc1, acc2, acc0);
  sqr_add_c2(a, 7, 3, acc1, acc2, acc0);
  sqr_add_c2(a, 6, 4, acc1, acc2, acc0);
  r[10] = acc1;
  acc1 = 0;

  /* column 11 */
  sqr_add_c2(a, 7, 4, acc2, acc0, acc1);
  sqr_add_c2(a, 6, 5, acc2, acc0, acc1);
  r[11] = acc2;
  acc2 = 0;

  /* column 12 */
  sqr_add_c(a, 6, acc0, acc1, acc2);
  sqr_add_c2(a, 7, 5, acc0, acc1, acc2);
  r[12] = acc0;
  acc0 = 0;

  /* column 13 */
  sqr_add_c2(a, 7, 6, acc1, acc2, acc0);
  r[13] = acc1;
  acc1 = 0;

  /* column 14, whose middle word is the top word of the square */
  sqr_add_c(a, 7, acc2, acc0, acc1);
  r[14] = acc2;
  r[15] = acc0;
}
566 
/*
 * bn_sqr_comba4 writes the 8-word square of the 4-word number |a| to |r|,
 * one output column at a time. Column k takes the square a[k/2]^2 when k is
 * even, plus twice each cross product a[i] * a[j] with i > j and i + j == k.
 * The three accumulator variables rotate roles (low, middle, high) from one
 * column to the next.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

  /* column 0 */
  sqr_add_c(a, 0, acc0, acc1, acc2);
  r[0] = acc0;
  acc0 = 0;

  /* column 1 */
  sqr_add_c2(a, 1, 0, acc1, acc2, acc0);
  r[1] = acc1;
  acc1 = 0;

  /* column 2 */
  sqr_add_c(a, 1, acc2, acc0, acc1);
  sqr_add_c2(a, 2, 0, acc2, acc0, acc1);
  r[2] = acc2;
  acc2 = 0;

  /* column 3 */
  sqr_add_c2(a, 3, 0, acc0, acc1, acc2);
  sqr_add_c2(a, 2, 1, acc0, acc1, acc2);
  r[3] = acc0;
  acc0 = 0;

  /* column 4 */
  sqr_add_c(a, 2, acc1, acc2, acc0);
  sqr_add_c2(a, 3, 1, acc1, acc2, acc0);
  r[4] = acc1;
  acc1 = 0;

  /* column 5 */
  sqr_add_c2(a, 3, 2, acc2, acc0, acc1);
  r[5] = acc2;
  acc2 = 0;

  /* column 6, whose middle word is the top word of the square */
  sqr_add_c(a, 3, acc0, acc1, acc2);
  r[6] = acc0;
  r[7] = acc1;
}
598 
599 #endif  /* !NO_ASM && X86_64 && !WINDOWS */
600