1 /* x86_64 BIGNUM accelerator version 0.1, December 2002.
2 *
3 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 * project.
5 *
6 * Rights for redistribution and usage in source and binary forms are
7 * granted according to the OpenSSL license. Warranty of any kind is
8 * disclaimed.
9 *
10 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
11 * versions, like 1.0...
12 * A. Well, that's because this code is basically a quick-n-dirty
13 * proof-of-concept hack. As you can see it's implemented with
14 * inline assembler, which means that you're bound to GCC and that
15 * there might be enough room for further improvement.
16 *
17 * Q. Why inline assembler?
18 * A. x86_64 features own ABI which I'm not familiar with. This is
19 * why I decided to let the compiler take care of subroutine
20 * prologue/epilogue as well as register allocation. For reference.
21 * Win64 implements different ABI for AMD64, different from Linux.
22 *
23 * Q. How much faster does it get?
24 * A. 'apps/openssl speed rsa dsa' output with no-asm:
25 *
26 * sign verify sign/s verify/s
27 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
28 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
29 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
30 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
31 * sign verify sign/s verify/s
32 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
33 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
34 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
35 *
36 * 'apps/openssl speed rsa dsa' output with this module:
37 *
38 * sign verify sign/s verify/s
39 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
40 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
41 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
42 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
43 * sign verify sign/s verify/s
44 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
45 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
46 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
47 *
48 * For the reference. IA-32 assembler implementation performs
49 * very much like 64-bit code compiled with no-asm on the same
50 * machine.
51 */
52
53 #include <openssl/bn.h>
54
55 /* TODO(davidben): Get this file working on Windows x64. */
56 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
57
58 #include "../internal.h"
59
60
61 #undef mul
62 #undef mul_add
63
64 #define asm __asm__
65
66 /*
67 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
68 * "g"(0) let the compiler to decide where does it
69 * want to keep the value of zero;
70 */
71 #define mul_add(r, a, word, carry) \
72 do { \
73 register BN_ULONG high, low; \
74 asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
75 asm("addq %2,%0; adcq %3,%1" \
76 : "+r"(carry), "+d"(high) \
77 : "a"(low), "g"(0) \
78 : "cc"); \
79 asm("addq %2,%0; adcq %3,%1" \
80 : "+m"(r), "+d"(high) \
81 : "r"(carry), "g"(0) \
82 : "cc"); \
83 (carry) = high; \
84 } while (0)
85
86 #define mul(r, a, word, carry) \
87 do { \
88 register BN_ULONG high, low; \
89 asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
90 asm("addq %2,%0; adcq %3,%1" \
91 : "+r"(carry), "+d"(high) \
92 : "a"(low), "g"(0) \
93 : "cc"); \
94 (r) = (carry); \
95 (carry) = high; \
96 } while (0)
97 #undef sqr
98 #define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
99
bn_mul_add_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)100 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
101 BN_ULONG w) {
102 BN_ULONG c1 = 0;
103
104 if (num <= 0) {
105 return (c1);
106 }
107
108 while (num & ~3) {
109 mul_add(rp[0], ap[0], w, c1);
110 mul_add(rp[1], ap[1], w, c1);
111 mul_add(rp[2], ap[2], w, c1);
112 mul_add(rp[3], ap[3], w, c1);
113 ap += 4;
114 rp += 4;
115 num -= 4;
116 }
117 if (num) {
118 mul_add(rp[0], ap[0], w, c1);
119 if (--num == 0) {
120 return c1;
121 }
122 mul_add(rp[1], ap[1], w, c1);
123 if (--num == 0) {
124 return c1;
125 }
126 mul_add(rp[2], ap[2], w, c1);
127 return c1;
128 }
129
130 return c1;
131 }
132
bn_mul_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)133 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
134 BN_ULONG c1 = 0;
135
136 if (num <= 0) {
137 return c1;
138 }
139
140 while (num & ~3) {
141 mul(rp[0], ap[0], w, c1);
142 mul(rp[1], ap[1], w, c1);
143 mul(rp[2], ap[2], w, c1);
144 mul(rp[3], ap[3], w, c1);
145 ap += 4;
146 rp += 4;
147 num -= 4;
148 }
149 if (num) {
150 mul(rp[0], ap[0], w, c1);
151 if (--num == 0) {
152 return c1;
153 }
154 mul(rp[1], ap[1], w, c1);
155 if (--num == 0) {
156 return c1;
157 }
158 mul(rp[2], ap[2], w, c1);
159 }
160 return c1;
161 }
162
bn_sqr_words(BN_ULONG * r,const BN_ULONG * a,int n)163 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
164 if (n <= 0) {
165 return;
166 }
167
168 while (n & ~3) {
169 sqr(r[0], r[1], a[0]);
170 sqr(r[2], r[3], a[1]);
171 sqr(r[4], r[5], a[2]);
172 sqr(r[6], r[7], a[3]);
173 a += 4;
174 r += 8;
175 n -= 4;
176 }
177 if (n) {
178 sqr(r[0], r[1], a[0]);
179 if (--n == 0) {
180 return;
181 }
182 sqr(r[2], r[3], a[1]);
183 if (--n == 0) {
184 return;
185 }
186 sqr(r[4], r[5], a[2]);
187 }
188 }
189
bn_add_words(BN_ULONG * rp,const BN_ULONG * ap,const BN_ULONG * bp,int n)190 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
191 int n) {
192 BN_ULONG ret;
193 size_t i = 0;
194
195 if (n <= 0) {
196 return 0;
197 }
198
199 asm volatile (
200 " subq %0,%0 \n" /* clear carry */
201 " jmp 1f \n"
202 ".p2align 4 \n"
203 "1:"
204 " movq (%4,%2,8),%0 \n"
205 " adcq (%5,%2,8),%0 \n"
206 " movq %0,(%3,%2,8) \n"
207 " lea 1(%2),%2 \n"
208 " loop 1b \n"
209 " sbbq %0,%0 \n"
210 : "=&r"(ret), "+c"(n), "+r"(i)
211 : "r"(rp), "r"(ap), "r"(bp)
212 : "cc", "memory");
213
214 return ret & 1;
215 }
216
bn_sub_words(BN_ULONG * rp,const BN_ULONG * ap,const BN_ULONG * bp,int n)217 BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
218 int n) {
219 BN_ULONG ret;
220 size_t i = 0;
221
222 if (n <= 0) {
223 return 0;
224 }
225
226 asm volatile (
227 " subq %0,%0 \n" /* clear borrow */
228 " jmp 1f \n"
229 ".p2align 4 \n"
230 "1:"
231 " movq (%4,%2,8),%0 \n"
232 " sbbq (%5,%2,8),%0 \n"
233 " movq %0,(%3,%2,8) \n"
234 " lea 1(%2),%2 \n"
235 " loop 1b \n"
236 " sbbq %0,%0 \n"
237 : "=&r"(ret), "+c"(n), "+r"(i)
238 : "r"(rp), "r"(ap), "r"(bp)
239 : "cc", "memory");
240
241 return ret & 1;
242 }
243
244 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
245 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
246 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
247 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
248 */
249
250 /* Keep in mind that carrying into high part of multiplication result can not
251 * overflow, because it cannot be all-ones. */
252 #define mul_add_c(a, b, c0, c1, c2) \
253 do { \
254 BN_ULONG t1, t2; \
255 asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
256 asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
257 : "+r"(c0), "+r"(c1), "+r"(c2) \
258 : "r"(t1), "r"(t2), "g"(0) \
259 : "cc"); \
260 } while (0)
261
262 #define sqr_add_c(a, i, c0, c1, c2) \
263 do { \
264 BN_ULONG t1, t2; \
265 asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
266 asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
267 : "+r"(c0), "+r"(c1), "+r"(c2) \
268 : "r"(t1), "r"(t2), "g"(0) \
269 : "cc"); \
270 } while (0)
271
272 #define mul_add_c2(a, b, c0, c1, c2) \
273 do { \
274 BN_ULONG t1, t2; \
275 asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
276 asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
277 : "+r"(c0), "+r"(c1), "+r"(c2) \
278 : "r"(t1), "r"(t2), "g"(0) \
279 : "cc"); \
280 asm("addq %3,%0; adcq %4,%1; adcq %5,%2" \
281 : "+r"(c0), "+r"(c1), "+r"(c2) \
282 : "r"(t1), "r"(t2), "g"(0) \
283 : "cc"); \
284 } while (0)
285
286 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
287
bn_mul_comba8(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)288 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
289 BN_ULONG c1, c2, c3;
290
291 c1 = 0;
292 c2 = 0;
293 c3 = 0;
294 mul_add_c(a[0], b[0], c1, c2, c3);
295 r[0] = c1;
296 c1 = 0;
297 mul_add_c(a[0], b[1], c2, c3, c1);
298 mul_add_c(a[1], b[0], c2, c3, c1);
299 r[1] = c2;
300 c2 = 0;
301 mul_add_c(a[2], b[0], c3, c1, c2);
302 mul_add_c(a[1], b[1], c3, c1, c2);
303 mul_add_c(a[0], b[2], c3, c1, c2);
304 r[2] = c3;
305 c3 = 0;
306 mul_add_c(a[0], b[3], c1, c2, c3);
307 mul_add_c(a[1], b[2], c1, c2, c3);
308 mul_add_c(a[2], b[1], c1, c2, c3);
309 mul_add_c(a[3], b[0], c1, c2, c3);
310 r[3] = c1;
311 c1 = 0;
312 mul_add_c(a[4], b[0], c2, c3, c1);
313 mul_add_c(a[3], b[1], c2, c3, c1);
314 mul_add_c(a[2], b[2], c2, c3, c1);
315 mul_add_c(a[1], b[3], c2, c3, c1);
316 mul_add_c(a[0], b[4], c2, c3, c1);
317 r[4] = c2;
318 c2 = 0;
319 mul_add_c(a[0], b[5], c3, c1, c2);
320 mul_add_c(a[1], b[4], c3, c1, c2);
321 mul_add_c(a[2], b[3], c3, c1, c2);
322 mul_add_c(a[3], b[2], c3, c1, c2);
323 mul_add_c(a[4], b[1], c3, c1, c2);
324 mul_add_c(a[5], b[0], c3, c1, c2);
325 r[5] = c3;
326 c3 = 0;
327 mul_add_c(a[6], b[0], c1, c2, c3);
328 mul_add_c(a[5], b[1], c1, c2, c3);
329 mul_add_c(a[4], b[2], c1, c2, c3);
330 mul_add_c(a[3], b[3], c1, c2, c3);
331 mul_add_c(a[2], b[4], c1, c2, c3);
332 mul_add_c(a[1], b[5], c1, c2, c3);
333 mul_add_c(a[0], b[6], c1, c2, c3);
334 r[6] = c1;
335 c1 = 0;
336 mul_add_c(a[0], b[7], c2, c3, c1);
337 mul_add_c(a[1], b[6], c2, c3, c1);
338 mul_add_c(a[2], b[5], c2, c3, c1);
339 mul_add_c(a[3], b[4], c2, c3, c1);
340 mul_add_c(a[4], b[3], c2, c3, c1);
341 mul_add_c(a[5], b[2], c2, c3, c1);
342 mul_add_c(a[6], b[1], c2, c3, c1);
343 mul_add_c(a[7], b[0], c2, c3, c1);
344 r[7] = c2;
345 c2 = 0;
346 mul_add_c(a[7], b[1], c3, c1, c2);
347 mul_add_c(a[6], b[2], c3, c1, c2);
348 mul_add_c(a[5], b[3], c3, c1, c2);
349 mul_add_c(a[4], b[4], c3, c1, c2);
350 mul_add_c(a[3], b[5], c3, c1, c2);
351 mul_add_c(a[2], b[6], c3, c1, c2);
352 mul_add_c(a[1], b[7], c3, c1, c2);
353 r[8] = c3;
354 c3 = 0;
355 mul_add_c(a[2], b[7], c1, c2, c3);
356 mul_add_c(a[3], b[6], c1, c2, c3);
357 mul_add_c(a[4], b[5], c1, c2, c3);
358 mul_add_c(a[5], b[4], c1, c2, c3);
359 mul_add_c(a[6], b[3], c1, c2, c3);
360 mul_add_c(a[7], b[2], c1, c2, c3);
361 r[9] = c1;
362 c1 = 0;
363 mul_add_c(a[7], b[3], c2, c3, c1);
364 mul_add_c(a[6], b[4], c2, c3, c1);
365 mul_add_c(a[5], b[5], c2, c3, c1);
366 mul_add_c(a[4], b[6], c2, c3, c1);
367 mul_add_c(a[3], b[7], c2, c3, c1);
368 r[10] = c2;
369 c2 = 0;
370 mul_add_c(a[4], b[7], c3, c1, c2);
371 mul_add_c(a[5], b[6], c3, c1, c2);
372 mul_add_c(a[6], b[5], c3, c1, c2);
373 mul_add_c(a[7], b[4], c3, c1, c2);
374 r[11] = c3;
375 c3 = 0;
376 mul_add_c(a[7], b[5], c1, c2, c3);
377 mul_add_c(a[6], b[6], c1, c2, c3);
378 mul_add_c(a[5], b[7], c1, c2, c3);
379 r[12] = c1;
380 c1 = 0;
381 mul_add_c(a[6], b[7], c2, c3, c1);
382 mul_add_c(a[7], b[6], c2, c3, c1);
383 r[13] = c2;
384 c2 = 0;
385 mul_add_c(a[7], b[7], c3, c1, c2);
386 r[14] = c3;
387 r[15] = c1;
388 }
389
bn_mul_comba4(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)390 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
391 BN_ULONG c1, c2, c3;
392
393 c1 = 0;
394 c2 = 0;
395 c3 = 0;
396 mul_add_c(a[0], b[0], c1, c2, c3);
397 r[0] = c1;
398 c1 = 0;
399 mul_add_c(a[0], b[1], c2, c3, c1);
400 mul_add_c(a[1], b[0], c2, c3, c1);
401 r[1] = c2;
402 c2 = 0;
403 mul_add_c(a[2], b[0], c3, c1, c2);
404 mul_add_c(a[1], b[1], c3, c1, c2);
405 mul_add_c(a[0], b[2], c3, c1, c2);
406 r[2] = c3;
407 c3 = 0;
408 mul_add_c(a[0], b[3], c1, c2, c3);
409 mul_add_c(a[1], b[2], c1, c2, c3);
410 mul_add_c(a[2], b[1], c1, c2, c3);
411 mul_add_c(a[3], b[0], c1, c2, c3);
412 r[3] = c1;
413 c1 = 0;
414 mul_add_c(a[3], b[1], c2, c3, c1);
415 mul_add_c(a[2], b[2], c2, c3, c1);
416 mul_add_c(a[1], b[3], c2, c3, c1);
417 r[4] = c2;
418 c2 = 0;
419 mul_add_c(a[2], b[3], c3, c1, c2);
420 mul_add_c(a[3], b[2], c3, c1, c2);
421 r[5] = c3;
422 c3 = 0;
423 mul_add_c(a[3], b[3], c1, c2, c3);
424 r[6] = c1;
425 r[7] = c2;
426 }
427
bn_sqr_comba8(BN_ULONG * r,const BN_ULONG * a)428 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
429 BN_ULONG c1, c2, c3;
430
431 c1 = 0;
432 c2 = 0;
433 c3 = 0;
434 sqr_add_c(a, 0, c1, c2, c3);
435 r[0] = c1;
436 c1 = 0;
437 sqr_add_c2(a, 1, 0, c2, c3, c1);
438 r[1] = c2;
439 c2 = 0;
440 sqr_add_c(a, 1, c3, c1, c2);
441 sqr_add_c2(a, 2, 0, c3, c1, c2);
442 r[2] = c3;
443 c3 = 0;
444 sqr_add_c2(a, 3, 0, c1, c2, c3);
445 sqr_add_c2(a, 2, 1, c1, c2, c3);
446 r[3] = c1;
447 c1 = 0;
448 sqr_add_c(a, 2, c2, c3, c1);
449 sqr_add_c2(a, 3, 1, c2, c3, c1);
450 sqr_add_c2(a, 4, 0, c2, c3, c1);
451 r[4] = c2;
452 c2 = 0;
453 sqr_add_c2(a, 5, 0, c3, c1, c2);
454 sqr_add_c2(a, 4, 1, c3, c1, c2);
455 sqr_add_c2(a, 3, 2, c3, c1, c2);
456 r[5] = c3;
457 c3 = 0;
458 sqr_add_c(a, 3, c1, c2, c3);
459 sqr_add_c2(a, 4, 2, c1, c2, c3);
460 sqr_add_c2(a, 5, 1, c1, c2, c3);
461 sqr_add_c2(a, 6, 0, c1, c2, c3);
462 r[6] = c1;
463 c1 = 0;
464 sqr_add_c2(a, 7, 0, c2, c3, c1);
465 sqr_add_c2(a, 6, 1, c2, c3, c1);
466 sqr_add_c2(a, 5, 2, c2, c3, c1);
467 sqr_add_c2(a, 4, 3, c2, c3, c1);
468 r[7] = c2;
469 c2 = 0;
470 sqr_add_c(a, 4, c3, c1, c2);
471 sqr_add_c2(a, 5, 3, c3, c1, c2);
472 sqr_add_c2(a, 6, 2, c3, c1, c2);
473 sqr_add_c2(a, 7, 1, c3, c1, c2);
474 r[8] = c3;
475 c3 = 0;
476 sqr_add_c2(a, 7, 2, c1, c2, c3);
477 sqr_add_c2(a, 6, 3, c1, c2, c3);
478 sqr_add_c2(a, 5, 4, c1, c2, c3);
479 r[9] = c1;
480 c1 = 0;
481 sqr_add_c(a, 5, c2, c3, c1);
482 sqr_add_c2(a, 6, 4, c2, c3, c1);
483 sqr_add_c2(a, 7, 3, c2, c3, c1);
484 r[10] = c2;
485 c2 = 0;
486 sqr_add_c2(a, 7, 4, c3, c1, c2);
487 sqr_add_c2(a, 6, 5, c3, c1, c2);
488 r[11] = c3;
489 c3 = 0;
490 sqr_add_c(a, 6, c1, c2, c3);
491 sqr_add_c2(a, 7, 5, c1, c2, c3);
492 r[12] = c1;
493 c1 = 0;
494 sqr_add_c2(a, 7, 6, c2, c3, c1);
495 r[13] = c2;
496 c2 = 0;
497 sqr_add_c(a, 7, c3, c1, c2);
498 r[14] = c3;
499 r[15] = c1;
500 }
501
bn_sqr_comba4(BN_ULONG * r,const BN_ULONG * a)502 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
503 BN_ULONG c1, c2, c3;
504
505 c1 = 0;
506 c2 = 0;
507 c3 = 0;
508 sqr_add_c(a, 0, c1, c2, c3);
509 r[0] = c1;
510 c1 = 0;
511 sqr_add_c2(a, 1, 0, c2, c3, c1);
512 r[1] = c2;
513 c2 = 0;
514 sqr_add_c(a, 1, c3, c1, c2);
515 sqr_add_c2(a, 2, 0, c3, c1, c2);
516 r[2] = c3;
517 c3 = 0;
518 sqr_add_c2(a, 3, 0, c1, c2, c3);
519 sqr_add_c2(a, 2, 1, c1, c2, c3);
520 r[3] = c1;
521 c1 = 0;
522 sqr_add_c(a, 2, c2, c3, c1);
523 sqr_add_c2(a, 3, 1, c2, c3, c1);
524 r[4] = c2;
525 c2 = 0;
526 sqr_add_c2(a, 3, 2, c3, c1, c2);
527 r[5] = c3;
528 c3 = 0;
529 sqr_add_c(a, 3, c1, c2, c3);
530 r[6] = c1;
531 r[7] = c2;
532 }
533
534 #undef mul_add
535 #undef mul
536 #undef sqr
537 #undef mul_add_c
538 #undef sqr_add_c
539 #undef mul_add_c2
540 #undef sqr_add_c2
541
542 #endif /* !NO_ASM && X86_64 && __GNUC__ */
543