• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2  * All rights reserved.
3  *
4  * This package is an SSL implementation written
5  * by Eric Young (eay@cryptsoft.com).
6  * The implementation was written so as to conform with Netscapes SSL.
7  *
8  * This library is free for commercial and non-commercial use as long as
9  * the following conditions are aheared to.  The following conditions
10  * apply to all code found in this distribution, be it the RC4, RSA,
11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
12  * included with this distribution is covered by the same copyright terms
13  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14  *
15  * Copyright remains Eric Young's, and as such any Copyright notices in
16  * the code are not to be removed.
17  * If this package is used in a product, Eric Young should be given attribution
18  * as the author of the parts of the library used.
19  * This can be in the form of a textual message at program startup or
20  * in documentation (online or textual) provided with the package.
21  *
22  * Redistribution and use in source and binary forms, with or without
23  * modification, are permitted provided that the following conditions
24  * are met:
25  * 1. Redistributions of source code must retain the copyright
26  *    notice, this list of conditions and the following disclaimer.
27  * 2. Redistributions in binary form must reproduce the above copyright
28  *    notice, this list of conditions and the following disclaimer in the
29  *    documentation and/or other materials provided with the distribution.
30  * 3. All advertising materials mentioning features or use of this software
31  *    must display the following acknowledgement:
32  *    "This product includes cryptographic software written by
33  *     Eric Young (eay@cryptsoft.com)"
34  *    The word 'cryptographic' can be left out if the rouines from the library
35  *    being used are not cryptographic related :-).
36  * 4. If you include any Windows specific code (or a derivative thereof) from
37  *    the apps directory (application code) you must include an acknowledgement:
38  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39  *
40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  * The licence and distribution terms for any publically available version or
53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
54  * copied and put under another distribution licence
55  * [including the GNU Public Licence.] */
56 
57 #include <openssl/bn.h>
58 
59 #include <assert.h>
60 
61 #include "internal.h"
62 
63 
64 /* Generic implementations of most operations are needed for:
65  * - Configurations without inline assembly.
66  * - Architectures other than x86 or x86_64.
 * Windows x86_64; x86_64-gcc.c does not build on MSVC. */
68 #if defined(OPENSSL_NO_ASM) || \
69     (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
70     (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))
71 
72 #if defined(OPENSSL_WINDOWS)
73 #define alloca _alloca
74 #else
75 #include <alloca.h>
76 #endif
77 
#ifdef BN_LLONG
/* mul_add(r, a, w, c): (c, r) = w*a + r + c, i.e. r gets the low word of the
 * double-width result and c the high word.  The additions cannot overflow the
 * double word because the high half of a product is never all-ones. */
#define mul_add(r, a, w, c)             \
  {                                     \
    BN_ULLONG t;                        \
    t = (BN_ULLONG)w * (a) + (r) + (c); \
    (r) = Lw(t);                        \
    (c) = Hw(t);                        \
  }

/* mul(r, a, w, c): (c, r) = w*a + c. */
#define mul(r, a, w, c)           \
  {                               \
    BN_ULLONG t;                  \
    t = (BN_ULLONG)w * (a) + (c); \
    (r) = Lw(t);                  \
    (c) = Hw(t);                  \
  }

/* sqr(r0, r1, a): (r1, r0) = a*a as a double-word value. */
#define sqr(r0, r1, a)        \
  {                           \
    BN_ULLONG t;              \
    t = (BN_ULLONG)(a) * (a); \
    (r0) = Lw(t);             \
    (r1) = Hw(t);             \
  }
102 
#elif defined(BN_UMULT_LOHI)
/* mul_add(r, a, w, c): (c, r) = w*a + r + c, built from a lo/hi single-word
 * multiply.  Carries out of the two word additions are folded into c; the
 * high half of a product is never all-ones, so c itself cannot overflow. */
#define mul_add(r, a, w, c)             \
  {                                     \
    BN_ULONG high, low, ret, tmp = (a); \
    ret = (r);                          \
    BN_UMULT_LOHI(low, high, w, tmp);   \
    ret += (c);                         \
    (c) = (ret < (c)) ? 1 : 0;          \
    (c) += high;                        \
    ret += low;                         \
    (c) += (ret < low) ? 1 : 0;         \
    (r) = ret;                          \
  }

/* mul(r, a, w, c): (c, r) = w*a + c. */
#define mul(r, a, w, c)                \
  {                                    \
    BN_ULONG high, low, ret, ta = (a); \
    BN_UMULT_LOHI(low, high, w, ta);   \
    ret = low + (c);                   \
    (c) = high;                        \
    (c) += (ret < low) ? 1 : 0;        \
    (r) = ret;                         \
  }

/* sqr(r0, r1, a): (r1, r0) = a*a. */
#define sqr(r0, r1, a)               \
  {                                  \
    BN_ULONG tmp = (a);              \
    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
  }
132 
#else

/*************************************************************
 * No long long type
 */

/* Half-word helpers on a single word: low half, high half, and the low half
 * shifted up into the high position. */
#define LBITS(a) ((a) & BN_MASK2l)
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)

/* The same helpers on a double word (BN_ULLONG). */
#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)

/* mul64(l, h, bl, bh): schoolbook multiply of two words split into half
 * words.  On entry l/h are the halves of one operand and bl/bh the halves of
 * the other; on exit (h, l) is the full double-word product. */
#define mul64(l, h, bl, bh)       \
  {                               \
    BN_ULONG m, m1, lt, ht;       \
                                  \
    lt = l;                       \
    ht = h;                       \
    m = (bh) * (lt);              \
    lt = (bl) * (lt);             \
    m1 = (bl) * (ht);             \
    ht = (bh) * (ht);             \
    m = (m + m1) & BN_MASK2;      \
    if (m < m1)                   \
      ht += L2HBITS((BN_ULONG)1); \
    ht += HBITS(m);               \
    m1 = L2HBITS(m);              \
    lt = (lt + m1) & BN_MASK2;    \
    if (lt < m1)                  \
      ht++;                       \
    (l) = lt;                     \
    (h) = ht;                     \
  }

/* sqr64(lo, ho, in): (ho, lo) = in*in via the half-word split; the cross
 * product m contributes twice, hence the extra one-bit shift. */
#define sqr64(lo, ho, in)                    \
  {                                          \
    BN_ULONG l, h, m;                        \
                                             \
    h = (in);                                \
    l = LBITS(h);                            \
    h = HBITS(h);                            \
    m = (l) * (h);                           \
    l *= l;                                  \
    h *= h;                                  \
    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
    m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
    l = (l + m) & BN_MASK2;                  \
    if (l < m)                               \
      h++;                                   \
    (lo) = l;                                \
    (ho) = h;                                \
  }

/* mul_add(r, a, bl, bh, c): (c, r) = w*a + r + c, where the multiplier
 * w = (bh, bl) has been pre-split into half words by the caller. */
#define mul_add(r, a, bl, bh, c) \
  {                              \
    BN_ULONG l, h;               \
                                 \
    h = (a);                     \
    l = LBITS(h);                \
    h = HBITS(h);                \
    mul64(l, h, (bl), (bh));     \
                                 \
    /* non-multiply part */      \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = (r);                   \
    l = (l + (c)) & BN_MASK2;    \
    if (l < (c))                 \
      h++;                       \
    (c) = h & BN_MASK2;          \
    (r) = l;                     \
  }

/* mul(r, a, bl, bh, c): (c, r) = w*a + c, with w = (bh, bl) pre-split. */
#define mul(r, a, bl, bh, c)  \
  {                           \
    BN_ULONG l, h;            \
                              \
    h = (a);                  \
    l = LBITS(h);             \
    h = HBITS(h);             \
    mul64(l, h, (bl), (bh));  \
                              \
    /* non-multiply part */   \
    l += (c);                 \
    if ((l & BN_MASK2) < (c)) \
      h++;                    \
    (c) = h & BN_MASK2;       \
    (r) = l & BN_MASK2;       \
  }
#endif /* !BN_LLONG */
226 
227 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
228 
/* Computes rp[i] += ap[i] * w for |num| words and returns the final carry. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG carry = 0;
  int i;

  assert(num >= 0);
  if (num <= 0) {
    return carry;
  }

  for (i = 0; i < num; i++) {
    mul_add(rp[i], ap[i], w, carry);
  }

  return carry;
}
257 
/* Computes rp[i] = ap[i] * w for |num| words and returns the final carry. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;
  int i;

  assert(num >= 0);
  if (num <= 0) {
    return carry;
  }

  for (i = 0; i < num; i++) {
    mul(rp[i], ap[i], w, carry);
  }

  return carry;
}
283 
/* Squares each of the |n| words of a, writing the 2*n result words to r. */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  int i;

  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  for (i = 0; i < n; i++) {
    sqr(r[2 * i], r[2 * i + 1], a[i]);
  }
}
306 
307 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
308 
/* Computes rp[i] += ap[i] * w for |num| words and returns the final carry.
 * Half-word variant: w is split once up front for the 5-argument mul_add. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG carry = 0;
  BN_ULONG wl, wh;
  int i;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  wl = LBITS(w);
  wh = HBITS(w);

  for (i = 0; i < num; i++) {
    mul_add(rp[i], ap[i], wl, wh, carry);
  }

  return carry;
}
339 
/* Computes rp[i] = ap[i] * w for |num| words and returns the final carry.
 * Half-word variant: w is split once up front for the 5-argument mul. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;
  BN_ULONG wl, wh;
  int i;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  wl = LBITS(w);
  wh = HBITS(w);

  for (i = 0; i < num; i++) {
    mul(rp[i], ap[i], wl, wh, carry);
  }

  return carry;
}
369 
/* Squares each of the |n| words of a, writing the 2*n result words to r.
 * Half-word variant built on sqr64. */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  int i;

  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  for (i = 0; i < n; i++) {
    sqr64(r[2 * i], r[2 * i + 1], a[i]);
  }
}
392 
393 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
394 
395 #if defined(BN_LLONG)
396 
/* Divides the double word (h:l) by d and returns the quotient word.
 * The quotient is assumed to fit in a single word — TODO confirm callers
 * guarantee h < d after normalization. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULLONG dividend = ((BN_ULLONG)h << BN_BITS2) | l;
  return (BN_ULONG)(dividend / (BN_ULLONG)d);
}
400 
401 #else
402 
/* Divide the double word (h:l) by d and return the one-word quotient.
 * Classic half-word long division (Knuth-style): after normalizing d so its
 * top bit is set, two rounds each estimate a half-word quotient digit from
 * the top half of the divisor, correct the estimate downward, and subtract. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;

  /* Division by zero: return the all-ones word rather than trapping. */
  if (d == 0) {
    return BN_MASK2;
  }

  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

  /* Normalize: shift d, and (h:l), left so d's top bit is set. */
  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d;
  }

  if (i) {
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  dh = (d & BN_MASK2h) >> BN_BITS4;
  dl = (d & BN_MASK2l);
  for (;;) {
    /* Estimate the next half-word quotient digit q from the top halves. */
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }

    th = q * dh;
    tl = dl * q;
    /* Correct q downward until q*d no longer exceeds the remainder. */
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    /* Subtract q*d from (h:l), propagating the borrow. */
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;

    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d;
      q--;
    }
    h -= th;

    if (--count == 0) {
      break;
    }

    /* First round done: save the high digit and shift up for the second. */
    ret = q << BN_BITS4;
    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    l = (l & BN_MASK2l) << BN_BITS4;
  }

  ret |= q;
  return ret;
}
472 
473 #endif /* !defined(BN_LLONG) */
474 
475 #ifdef BN_LLONG
/* Computes r = a + b over |n| words and returns the carry out (0 or 1). */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULLONG acc = 0;
  int i;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  /* Fold each word pair plus the incoming carry into a double word; the low
   * half is the result word and the high half is the carry out. */
  for (i = 0; i < n; i++) {
    acc += (BN_ULLONG)a[i] + b[i];
    r[i] = (BN_ULONG)acc & BN_MASK2;
    acc >>= BN_BITS2;
  }

  return (BN_ULONG)acc;
}
514 
515 #else /* !BN_LLONG */
516 
/* Computes r = a + b over |n| words and returns the carry out (0 or 1).
 * No double-word type is available, so the carry is detected via wraparound
 * comparisons after each masked addition. */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG carry, sum, tmp;
  int i;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  carry = 0;
  for (i = 0; i < n; i++) {
    tmp = (a[i] + carry) & BN_MASK2;
    carry = (tmp < carry);
    sum = (tmp + b[i]) & BN_MASK2;
    carry += (sum < tmp);
    r[i] = sum;
  }

  return (BN_ULONG)carry;
}
571 
572 #endif /* !BN_LLONG */
573 
/* Computes r = a - b over |n| words and returns the borrow out (0 or 1). */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG x, y;
  int borrow = 0;
  int i;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  for (i = 0; i < n; i++) {
    x = a[i];
    y = b[i];
    r[i] = (x - y - borrow) & BN_MASK2;
    /* When the words are equal the incoming borrow passes through
     * unchanged; otherwise it is determined by the comparison alone. */
    if (x != y) {
      borrow = (x < y);
    }
  }

  return borrow;
}
628 
629 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
630 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
631 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
632 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
633 
#ifdef BN_LLONG

/* Keep in mind that additions to multiplication result can not overflow,
 * because its high half cannot be all-ones. */
/* mul_add_c(a, b, c0, c1, c2): (c2,c1,c0) += a*b. */
#define mul_add_c(a, b, c0, c1, c2)     \
  do {                                  \
    BN_ULONG hi;                        \
    BN_ULLONG t = (BN_ULLONG)(a) * (b); \
    t += c0; /* no carry */             \
    c0 = (BN_ULONG)Lw(t);               \
    hi = (BN_ULONG)Hw(t);               \
    c1 = (c1 + hi) & BN_MASK2;          \
    if (c1 < hi)                        \
      c2++;                             \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): (c2,c1,c0) += 2*a*b, by folding the
 * double-width product into the accumulator twice. */
#define mul_add_c2(a, b, c0, c1, c2)      \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)(a) * (b);   \
    BN_ULLONG tt = t + c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(tt);                \
    hi = (BN_ULONG)Hw(tt);                \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): (c2,c1,c0) += a[i]^2. */
#define sqr_add_c(a, i, c0, c1, c2)       \
  do {                                    \
    BN_ULONG hi;                          \
    BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
    t += c0; /* no carry */               \
    c0 = (BN_ULONG)Lw(t);                 \
    hi = (BN_ULONG)Hw(t);                 \
    c1 = (c1 + hi) & BN_MASK2;            \
    if (c1 < hi)                          \
      c2++;                               \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): (c2,c1,c0) += 2*a[i]*a[j]. */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
681 
#elif defined(BN_UMULT_LOHI)

/* Keep in mind that additions to hi can not overflow, because the high word of
 * a multiplication result cannot be all-ones. */
/* mul_add_c(a, b, c0, c1, c2): (c2,c1,c0) += a*b. */
#define mul_add_c(a, b, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a), tb = (b);    \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, tb);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): (c2,c1,c0) += 2*a*b, folding the single
 * product into the accumulator twice. */
#define mul_add_c2(a, b, c0, c1, c2) \
  do {                               \
    BN_ULONG ta = (a), tb = (b);     \
    BN_ULONG lo, hi, tt;             \
    BN_UMULT_LOHI(lo, hi, ta, tb);   \
    c0 += lo;                        \
    tt = hi + ((c0 < lo) ? 1 : 0);   \
    c1 += tt;                        \
    c2 += (c1 < tt) ? 1 : 0;         \
    c0 += lo;                        \
    hi += (c0 < lo) ? 1 : 0;         \
    c1 += hi;                        \
    c2 += (c1 < hi) ? 1 : 0;         \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): (c2,c1,c0) += a[i]^2. */
#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG ta = (a)[i];           \
    BN_ULONG lo, hi;                \
    BN_UMULT_LOHI(lo, hi, ta, ta);  \
    c0 += lo;                       \
    hi += (c0 < lo) ? 1 : 0;        \
    c1 += hi;                       \
    c2 += (c1 < hi) ? 1 : 0;        \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): (c2,c1,c0) += 2*a[i]*a[j]. */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
724 
#else /* !BN_LLONG */

/* Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones. */

/* mul_add_c(a, b, c0, c1, c2): (c2,c1,c0) += a*b, built on the half-word
 * mul64 primitive. */
#define mul_add_c(a, b, c0, c1, c2)        \
  do {                                     \
    BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    mul64(lo, hi, bl, bh);                 \
    c0 = (c0 + lo) & BN_MASK2;             \
    if (c0 < lo)                           \
      hi++;                                \
    c1 = (c1 + hi) & BN_MASK2;             \
    if (c1 < hi)                           \
      c2++;                                \
  } while (0)

/* mul_add_c2(a, b, c0, c1, c2): (c2,c1,c0) += 2*a*b, folding the product
 * into the accumulator twice. */
#define mul_add_c2(a, b, c0, c1, c2)       \
  do {                                     \
    BN_ULONG tt;                           \
    BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    mul64(lo, hi, bl, bh);                 \
    tt = hi;                               \
    c0 = (c0 + lo) & BN_MASK2;             \
    if (c0 < lo)                           \
      tt++;                                \
    c1 = (c1 + tt) & BN_MASK2;             \
    if (c1 < tt)                           \
      c2++;                                \
    c0 = (c0 + lo) & BN_MASK2;             \
    if (c0 < lo)                           \
      hi++;                                \
    c1 = (c1 + hi) & BN_MASK2;             \
    if (c1 < hi)                           \
      c2++;                                \
  } while (0)

/* sqr_add_c(a, i, c0, c1, c2): (c2,c1,c0) += a[i]^2, via sqr64. */
#define sqr_add_c(a, i, c0, c1, c2) \
  do {                              \
    BN_ULONG lo, hi;                \
    sqr64(lo, hi, (a)[i]);          \
    c0 = (c0 + lo) & BN_MASK2;      \
    if (c0 < lo)                    \
      hi++;                         \
    c1 = (c1 + hi) & BN_MASK2;      \
    if (c1 < hi)                    \
      c2++;                         \
  } while (0)

/* sqr_add_c2(a, i, j, c0, c1, c2): (c2,c1,c0) += 2*a[i]*a[j]. */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif /* !BN_LLONG */
778 
/* bn_mul_comba8 computes r[0..15] = a[0..7] * b[0..7] using comba
 * (column-wise) scheduling: all partial products a[i]*b[j] with i+j == k are
 * accumulated into a three-word accumulator before r[k] is stored.  The
 * accumulator lives in (c1, c2, c3) with the roles rotating each column:
 * after a column is stored, its low word is zeroed and becomes the new top
 * word of the accumulator. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
880 
/* bn_mul_comba4 computes r[0..7] = a[0..3] * b[0..3] with comba (column-wise)
 * scheduling; see bn_mul_comba8 for the accumulator-rotation scheme. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
918 
/* bn_sqr_comba8 computes r[0..15] = a[0..7]^2 with comba scheduling.
 * Diagonal terms a[i]^2 use sqr_add_c; off-diagonal terms a[i]*a[j] (i != j)
 * appear twice in a square and use sqr_add_c2, which adds the product twice.
 * The three-word accumulator rotates through (c1, c2, c3) as in
 * bn_mul_comba8. */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
992 
/* bn_sqr_comba4 computes r[0..7] = a[0..3]^2 with comba scheduling; see
 * bn_sqr_comba8 for the diagonal/off-diagonal and accumulator conventions. */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
1024 
1025 #if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
1026 /* This is essentially reference implementation, which may or may not
1027  * result in performance improvement. E.g. on IA-32 this routine was
1028  * observed to give 40% faster rsa1024 private key operations and 10%
1029  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
1030  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
1031  * reference implementation, one to be used as starting point for
1032  * platform-specific assembler. Mentioned numbers apply to compiler
1033  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
1034  * can vary not only from platform to platform, but even for compiler
1035  * versions. Assembler vs. assembler improvement coefficients can
1036  * [and are known to] differ and are to be documented elsewhere. */
/* bn_mul_mont sets rp[0..num-1] to the Montgomery product
 * ap * bp * R^-1 mod np, where R = 2^(num * BN_BITS2) and *n0p is the
 * precomputed -np[0]^-1 mod 2^BN_BITS2.  Returns 1 on success.  See the
 * comment above for why this reference implementation exists. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
  BN_ULONG c0, c1, ml, *tp, n0;
#ifdef mul64
  BN_ULONG mh; /* high half of the current multiplier (half-word config) */
#endif
  /* vp aliases tp; the volatile accesses at the end keep the zeroization of
   * the temporary from being optimized away. */
  volatile BN_ULONG *vp;
  int i = 0, j;

#if 0 /* template for platform-specific implementation */
	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
  /* NOTE(review): num+2 words on the stack via alloca — assumes num is
   * bounded by callers; confirm before using with untrusted sizes. */
  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

  n0 = *n0p;

  /* i == 0 iteration: tp = ap * bp[0] is a plain multiply (nothing to add
   * into yet), so it is peeled off here and control jumps into the loop at
   * the reduction step below. */
  c0 = 0;
  ml = bp[0];
#ifdef mul64
  mh = HBITS(ml);
  ml = LBITS(ml);
  for (j = 0; j < num; ++j) {
    mul(tp[j], ap[j], ml, mh, c0);
  }
#else
  for (j = 0; j < num; ++j) {
    mul(tp[j], ap[j], ml, c0);
  }
#endif

  tp[num] = c0;
  tp[num + 1] = 0;
  goto enter;

  for (i = 0; i < num; i++) {
    /* tp += ap * bp[i] */
    c0 = 0;
    ml = bp[i];
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j) {
      mul_add(tp[j], ap[j], ml, mh, c0);
    }
#else
    for (j = 0; j < num; ++j) {
      mul_add(tp[j], ap[j], ml, c0);
    }
#endif
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num] = c1;
    tp[num + 1] = (c1 < c0 ? 1 : 0);
  enter:
    /* Montgomery reduction step: pick ml = tp[0] * n0 so that tp + ml*np
     * has a zero low word, add ml*np, and shift tp down one word. */
    c1 = tp[0];
    ml = (c1 * n0) & BN_MASK2;
    c0 = 0;
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    mul_add(c1, np[0], ml, mh, c0);
#else
    mul_add(c1, ml, np[0], c0);
#endif
    for (j = 1; j < num; j++) {
      c1 = tp[j];
#ifdef mul64
      mul_add(c1, np[j], ml, mh, c0);
#else
      mul_add(c1, ml, np[j], c0);
#endif
      tp[j - 1] = c1 & BN_MASK2;
    }
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num - 1] = c1;
    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  }

  /* Final conditional subtraction.  The top-word comparison is only a quick
   * screen: if it suggests tp may be >= np, try rp = tp - np; a borrow from
   * bn_sub_words means tp < np after all, so fall through and copy tp.
   * Either way the temporary is wiped through the volatile alias. */
  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
    c0 = bn_sub_words(rp, tp, np, num);
    if (tp[num] != 0 || c0 == 0) {
      for (i = 0; i < num + 2; i++) {
        vp[i] = 0;
      }
      return 1;
    }
  }
  for (i = 0; i < num; i++) {
    rp[i] = tp[i], vp[i] = 0;
  }
  vp[num] = 0;
  vp[num + 1] = 0;
  return 1;
}
1129 #endif
1130 
1131 #endif
1132