1 /* x86_64 BIGNUM accelerator version 0.1, December 2002.
2 *
3 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4 * project.
5 *
6 * Rights for redistribution and usage in source and binary forms are
7 * granted according to the OpenSSL license. Warranty of any kind is
8 * disclaimed.
9 *
10 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
11 * versions, like 1.0...
12 * A. Well, that's because this code is basically a quick-n-dirty
13 * proof-of-concept hack. As you can see it's implemented with
14 * inline assembler, which means that you're bound to GCC and that
15 * there might be enough room for further improvement.
16 *
17 * Q. Why inline assembler?
 * A. x86_64 features its own ABI which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
22 *
23 * Q. How much faster does it get?
24 * A. 'apps/openssl speed rsa dsa' output with no-asm:
25 *
26 * sign verify sign/s verify/s
27 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
28 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
29 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
30 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
31 * sign verify sign/s verify/s
32 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
33 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
34 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
35 *
36 * 'apps/openssl speed rsa dsa' output with this module:
37 *
38 * sign verify sign/s verify/s
39 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
40 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
41 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
42 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
43 * sign verify sign/s verify/s
44 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
45 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
46 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
47 *
48 * For the reference. IA-32 assembler implementation performs
49 * very much like 64-bit code compiled with no-asm on the same
50 * machine.
51 */
52
53 #include <openssl/bn.h>
54
55 // TODO(davidben): Get this file working on MSVC x64.
56 #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
57 (defined(__GNUC__) || defined(__clang__))
58
59 #include "../internal.h"
60
61
#undef mul
#undef mul_add

// mul_add(r, a, word, carry): multiply-accumulate one word.
// Computes the double-word product a * word, adds the incoming carry to the
// low word (capturing any overflow into the high word), adds the result into
// r in memory, and leaves the high word in carry for the next iteration.
//
// Constraint notes: "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
// "g"(0) lets the compiler decide where it wants to keep the value of zero.
#define mul_add(r, a, word, carry)                                           \
  do {                                                                       \
    register BN_ULONG high, low;                                             \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc");   \
    __asm__("addq %2,%0; adcq %3,%1"                                         \
            : "+r"(carry), "+d"(high)                                        \
            : "a"(low), "g"(0)                                               \
            : "cc");                                                         \
    __asm__("addq %2,%0; adcq %3,%1"                                         \
            : "+m"(r), "+d"(high)                                            \
            : "r"(carry), "g"(0)                                             \
            : "cc");                                                         \
    (carry) = high;                                                          \
  } while (0)

// mul(r, a, word, carry): like mul_add but r is written, not accumulated.
// Computes a * word, adds the incoming carry to the low word (overflow goes
// into the high word via adcq), stores the low word into r, and leaves the
// high word in carry.
#define mul(r, a, word, carry)                                               \
  do {                                                                       \
    register BN_ULONG high, low;                                             \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc");   \
    __asm__("addq %2,%0; adcq %3,%1"                                         \
            : "+r"(carry), "+d"(high)                                        \
            : "a"(low), "g"(0)                                               \
            : "cc");                                                         \
    (r) = (carry);                                                           \
    (carry) = high;                                                          \
  } while (0)
// sqr(r0, r1, a): full double-word square, (r1, r0) = a * a.
#undef sqr
#define sqr(r0, r1, a) __asm__("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
96
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                          BN_ULONG w) {
  // rp[i] += ap[i] * w over num words; returns the carry word out of the
  // top. Each word is handled by the mul_add() inline-asm macro.
  BN_ULONG carry = 0;

  // Main loop: four words per iteration.
  while (num >= 4) {
    mul_add(rp[0], ap[0], w, carry);
    mul_add(rp[1], ap[1], w, carry);
    mul_add(rp[2], ap[2], w, carry);
    mul_add(rp[3], ap[3], w, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  // Tail: at most three words remain (num == 0 does nothing).
  switch (num) {
    case 3:
      mul_add(rp[0], ap[0], w, carry);
      mul_add(rp[1], ap[1], w, carry);
      mul_add(rp[2], ap[2], w, carry);
      break;
    case 2:
      mul_add(rp[0], ap[0], w, carry);
      mul_add(rp[1], ap[1], w, carry);
      break;
    case 1:
      mul_add(rp[0], ap[0], w, carry);
      break;
    default:
      break;
  }

  return carry;
}
129
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                      BN_ULONG w) {
  // rp[i] = low word of ap[i] * w plus the propagated carry; the high words
  // form a carry chain. Returns the final carry word.
  BN_ULONG carry = 0;

  // Main loop: four words per iteration.
  while (num >= 4) {
    mul(rp[0], ap[0], w, carry);
    mul(rp[1], ap[1], w, carry);
    mul(rp[2], ap[2], w, carry);
    mul(rp[3], ap[3], w, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  // Tail: at most three words remain (num == 0 does nothing).
  switch (num) {
    case 3:
      mul(rp[0], ap[0], w, carry);
      mul(rp[1], ap[1], w, carry);
      mul(rp[2], ap[2], w, carry);
      break;
    case 2:
      mul(rp[0], ap[0], w, carry);
      mul(rp[1], ap[1], w, carry);
      break;
    case 1:
      mul(rp[0], ap[0], w, carry);
      break;
    default:
      break;
  }

  return carry;
}
160
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
  // r[2*i] and r[2*i+1] receive the low and high words of a[i]^2, so r must
  // hold 2*n words. There is no carry between squares.
  while (n >= 4) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }

  // Tail: at most three squares remain (n == 0 does nothing).
  switch (n) {
    case 3:
      sqr(r[0], r[1], a[0]);
      sqr(r[2], r[3], a[1]);
      sqr(r[4], r[5], a[2]);
      break;
    case 2:
      sqr(r[0], r[1], a[0]);
      sqr(r[2], r[3], a[1]);
      break;
    case 1:
      sqr(r[0], r[1], a[0]);
      break;
    default:
      break;
  }
}
187
// bn_add_words computes rp[i] = ap[i] + bp[i] for i in [0, n) and returns
// the carry out of the top word (0 or 1).
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n == 0) {
    return 0;
  }

  // "+c"(n) pins the loop counter to rcx. The carry chain survives the whole
  // loop because lea does not touch flags and dec leaves CF intact.
  __asm__ volatile (
      " subq %0,%0 \n" // clear carry (CF) and ret
      " jmp 1f \n"
      ".p2align 4 \n" // align the loop head
      "1:"
      " movq (%4,%2,8),%0 \n" // ret = ap[i]
      " adcq (%5,%2,8),%0 \n" // ret += bp[i] + CF
      " movq %0,(%3,%2,8) \n" // rp[i] = ret
      " lea 1(%2),%2 \n" // i++ without clobbering CF
      " dec %1 \n" // --n; sets ZF but preserves CF
      " jnz 1b \n"
      " sbbq %0,%0 \n" // ret = 0 or all-ones, depending on final CF
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  // ret is 0 or ~0; mask down to the 0/1 carry.
  return ret & 1;
}
215
// bn_sub_words computes rp[i] = ap[i] - bp[i] for i in [0, n) and returns
// the borrow out of the top word (0 or 1).
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n == 0) {
    return 0;
  }

  // "+c"(n) pins the loop counter to rcx. The borrow chain survives the
  // whole loop because lea does not touch flags and dec leaves CF intact.
  __asm__ volatile (
      " subq %0,%0 \n" // clear borrow (CF) and ret
      " jmp 1f \n"
      ".p2align 4 \n" // align the loop head
      "1:"
      " movq (%4,%2,8),%0 \n" // ret = ap[i]
      " sbbq (%5,%2,8),%0 \n" // ret -= bp[i] + CF
      " movq %0,(%3,%2,8) \n" // rp[i] = ret
      " lea 1(%2),%2 \n" // i++ without clobbering CF
      " dec %1 \n" // --n; sets ZF but preserves CF
      " jnz 1b \n"
      " sbbq %0,%0 \n" // ret = 0 or all-ones, depending on final CF
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  // ret is 0 or ~0; mask down to the 0/1 borrow.
  return ret & 1;
}
243
// mul_add_c(a,b,c0,c1,c2)    -- c += a*b for three-word number c=(c2,c1,c0)
// mul_add_c2(a,b,c0,c1,c2)   -- c += 2*a*b for three-word number c=(c2,c1,c0)
// sqr_add_c(a,i,c0,c1,c2)    -- c += a[i]^2 for three-word number c=(c2,c1,c0)
// sqr_add_c2(a,i,j,c0,c1,c2) -- c += 2*a[i]*a[j] for three-word number
//                               c=(c2,c1,c0)

// Keep in mind that carrying into high part of multiplication result can not
// overflow, because it cannot be all-ones.
#define mul_add_c(a, b, c0, c1, c2)                                      \
  do {                                                                   \
    BN_ULONG t1, t2;                                                     \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");     \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                         \
            : "+r"(c0), "+r"(c1), "+r"(c2)                               \
            : "r"(t1), "r"(t2), "g"(0)                                   \
            : "cc");                                                     \
  } while (0)

// Adds the double-word square (t2, t1) = a[i]^2 into (c2, c1, c0).
#define sqr_add_c(a, i, c0, c1, c2)                                      \
  do {                                                                   \
    BN_ULONG t1, t2;                                                     \
    __asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc");        \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                         \
            : "+r"(c0), "+r"(c1), "+r"(c2)                               \
            : "r"(t1), "r"(t2), "g"(0)                                   \
            : "cc");                                                     \
  } while (0)

// Adds the product twice via two full add/adc/adc chains (rather than
// shifting it left), so both carries propagate through the accumulator.
#define mul_add_c2(a, b, c0, c1, c2)                                     \
  do {                                                                   \
    BN_ULONG t1, t2;                                                     \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");     \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                         \
            : "+r"(c0), "+r"(c1), "+r"(c2)                               \
            : "r"(t1), "r"(t2), "g"(0)                                   \
            : "cc");                                                     \
    __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2"                         \
            : "+r"(c0), "+r"(c1), "+r"(c2)                               \
            : "r"(t1), "r"(t2), "g"(0)                                   \
            : "cc");                                                     \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
286
// bn_mul_comba8 computes r = a * b for 8-word inputs using the Comba
// (column-wise) method. (c3, c2, c1) acts as a rotating three-word
// accumulator: column k sums every product a[i]*b[j] with i + j == k, the
// low accumulator word is stored to r[k], then cleared and rotated in as the
// next column's top word. The statement order carries the running sum, so it
// must not be reordered.
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7.
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14, plus the final top word.
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
388
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
  // Comba 4x4 multiplication: r = a * b, computed column by column with the
  // rotating three-word accumulator (acc2, acc1, acc0). Column k sums all
  // products a[i]*b[j] with i + j == k; after each column the low word is
  // stored and recycled as the next column's top word.
  BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

  // Column 0.
  mul_add_c(a[0], b[0], acc0, acc1, acc2);
  r[0] = acc0;
  acc0 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], acc1, acc2, acc0);
  mul_add_c(a[1], b[0], acc1, acc2, acc0);
  r[1] = acc1;
  acc1 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], acc2, acc0, acc1);
  mul_add_c(a[1], b[1], acc2, acc0, acc1);
  mul_add_c(a[0], b[2], acc2, acc0, acc1);
  r[2] = acc2;
  acc2 = 0;
  // Column 3.
  mul_add_c(a[0], b[3], acc0, acc1, acc2);
  mul_add_c(a[1], b[2], acc0, acc1, acc2);
  mul_add_c(a[2], b[1], acc0, acc1, acc2);
  mul_add_c(a[3], b[0], acc0, acc1, acc2);
  r[3] = acc0;
  acc0 = 0;
  // Column 4.
  mul_add_c(a[3], b[1], acc1, acc2, acc0);
  mul_add_c(a[2], b[2], acc1, acc2, acc0);
  mul_add_c(a[1], b[3], acc1, acc2, acc0);
  r[4] = acc1;
  acc1 = 0;
  // Column 5.
  mul_add_c(a[2], b[3], acc2, acc0, acc1);
  mul_add_c(a[3], b[2], acc2, acc0, acc1);
  r[5] = acc2;
  acc2 = 0;
  // Column 6, plus the final top word.
  mul_add_c(a[3], b[3], acc0, acc1, acc2);
  r[6] = acc0;
  r[7] = acc1;
}
426
// bn_sqr_comba8 computes r = a^2 for an 8-word input using the Comba method.
// sqr_add_c adds the diagonal term a[i]^2 into column 2*i; sqr_add_c2 adds
// the cross term 2*a[i]*a[j] (each off-diagonal product appears twice in a
// square). (c3, c2, c1) rotates as the three-word column accumulator; the
// statement order carries the running sum, so it must not be reordered.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7.
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14, plus the final top word.
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
500
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
  // Comba squaring of a 4-word input: r = a^2, computed column by column
  // with the rotating three-word accumulator (acc2, acc1, acc0). Diagonal
  // terms a[i]^2 come from sqr_add_c; cross terms 2*a[i]*a[j] come from
  // sqr_add_c2 (each off-diagonal product appears twice in a square).
  BN_ULONG acc0 = 0, acc1 = 0, acc2 = 0;

  // Column 0: a[0]^2.
  sqr_add_c(a, 0, acc0, acc1, acc2);
  r[0] = acc0;
  acc0 = 0;
  // Column 1: 2*a[1]*a[0].
  sqr_add_c2(a, 1, 0, acc1, acc2, acc0);
  r[1] = acc1;
  acc1 = 0;
  // Column 2: a[1]^2 + 2*a[2]*a[0].
  sqr_add_c(a, 1, acc2, acc0, acc1);
  sqr_add_c2(a, 2, 0, acc2, acc0, acc1);
  r[2] = acc2;
  acc2 = 0;
  // Column 3: 2*a[3]*a[0] + 2*a[2]*a[1].
  sqr_add_c2(a, 3, 0, acc0, acc1, acc2);
  sqr_add_c2(a, 2, 1, acc0, acc1, acc2);
  r[3] = acc0;
  acc0 = 0;
  // Column 4: a[2]^2 + 2*a[3]*a[1].
  sqr_add_c(a, 2, acc1, acc2, acc0);
  sqr_add_c2(a, 3, 1, acc1, acc2, acc0);
  r[4] = acc1;
  acc1 = 0;
  // Column 5: 2*a[3]*a[2].
  sqr_add_c2(a, 3, 2, acc2, acc0, acc1);
  r[5] = acc2;
  acc2 = 0;
  // Column 6: a[3]^2, plus the final top word.
  sqr_add_c(a, 3, acc0, acc1, acc2);
  r[6] = acc0;
  r[7] = acc1;
}
532
533 #undef mul_add
534 #undef mul
535 #undef sqr
536 #undef mul_add_c
537 #undef sqr_add_c
538 #undef mul_add_c2
539 #undef sqr_add_c2
540
541 #endif // !NO_ASM && X86_64 && (__GNUC__ || __clang__)
542