/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
2
3#ifndef _UECC_ASM_AVR_H_
4#define _UECC_ASM_AVR_H_
5
6#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
7    #define uECC_MIN_WORDS 32
8#endif
9#if uECC_SUPPORTS_secp224r1
10    #undef uECC_MIN_WORDS
11    #define uECC_MIN_WORDS 28
12#endif
13#if uECC_SUPPORTS_secp192r1
14    #undef uECC_MIN_WORDS
15    #define uECC_MIN_WORDS 24
16#endif
17#if uECC_SUPPORTS_secp160r1
18    #undef uECC_MIN_WORDS
19    #define uECC_MIN_WORDS 20
20#endif
21
/*
 * IJMP is the assembler text for an indirect jump through Z. It is pasted
 * into the computed-jump sequences of the routines in this file. Targets that
 * define __AVR_HAVE_EIJMP_EICALL__ get the extended form "eijmp".
 */
#if __AVR_HAVE_EIJMP_EICALL__
    #define IJMP "eijmp \n\t"
#else
    #define IJMP "ijmp \n\t"
#endif
27
28#if (uECC_OPTIMIZATION_LEVEL >= 2)
29
30uECC_VLI_API void uECC_vli_clear(uECC_word_t *vli, wordcount_t num_words) {
31    volatile uECC_word_t *v = vli;
32    __asm__ volatile (
33    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
34        "ldi r30, pm_lo8(1f) \n\t"
35        "ldi r31, pm_hi8(1f) \n\t"
36        "sub r30, %[num] \n\t"
37        "sbc r31, __zero_reg__ \n\t"
38        IJMP
39    #endif
40
41        REPEAT(uECC_MAX_WORDS, "st x+, __zero_reg__ \n\t")
42        "1: \n\t"
43        : "+x" (v)
44        : [num] "r" (num_words)
45        :
46    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
47          "r30", "r31", "cc"
48    #endif
49    );
50}
51#define asm_clear 1
52
53uECC_VLI_API void uECC_vli_set(uECC_word_t *dest, const uECC_word_t *src, wordcount_t num_words) {
54    volatile uECC_word_t *d = dest;
55    __asm__ volatile (
56    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
57        "ldi r30, pm_lo8(1f) \n\t"
58        "ldi r31, pm_hi8(1f) \n\t"
59        "sub r30, %[num] \n\t"
60        "sbc r31, __zero_reg__ \n\t"
61        IJMP
62    #endif
63
64        REPEAT(uECC_MAX_WORDS,
65            "ld r0, y+ \n\t"
66            "st x+, r0 \n\t")
67        "1: \n\t"
68        : "+x" (d), "+y" (src)
69        : [num] "r" ((uint8_t)(num_words * 2))
70        : "r0",
71    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
72          "r30", "r31", "cc"
73    #endif
74    );
75}
76#define asm_set 1
77
78uECC_VLI_API void uECC_vli_rshift1(uECC_word_t *vli, wordcount_t num_words) {
79    volatile uECC_word_t *v = vli;
80    __asm__ volatile (
81    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
82        "ldi r30, pm_lo8(1f) \n\t"
83        "ldi r31, pm_hi8(1f) \n\t"
84        "sub r30, %[jump] \n\t"
85        "sbc r31, __zero_reg__ \n\t"
86    #endif
87
88        "add r26, %[num] \n\t"
89        "adc r27, __zero_reg__ \n\t"
90        "ld r0, -x \n\t"
91        "lsr r0 \n\t"
92        "st x, r0 \n\t"
93    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
94        IJMP
95    #endif
96
97        REPEAT(DEC(uECC_MAX_WORDS),
98            "ld r0, -x \n\t"
99            "ror r0 \n\t"
100            "st x, r0 \n\t")
101        "1: \n\t"
102        : "+x" (v)
103    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
104        : [num] "r" (num_words), [jump] "r" ((uint8_t)(3 * (num_words - 1)))
105        : "r0", "r30", "r31", "cc"
106    #else
107        : [num] "r" (num_words)
108        : "r0", "cc"
109    #endif
110    );
111}
112#define asm_rshift1 1
113
114#define ADD_RJPM_TABLE(N)       \
115    "movw r30, %A[result] \n\t" \
116    "rjmp add_%=_" #N " \n\t"
117
118#define ADD_RJPM_DEST(N)     \
119    "add_%=_" #N ":"         \
120    "ld %[clb], x+ \n\t"     \
121    "ld %[rb], y+ \n\t"      \
122    "adc %[clb], %[rb] \n\t" \
123    "st z+, %[clb] \n\t"
124
125uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
126                                      const uECC_word_t *left,
127                                      const uECC_word_t *right,
128                                      wordcount_t num_words) {
129    volatile uECC_word_t *r = result;
130    uint8_t carry;
131    uint8_t right_byte;
132
133    __asm__ volatile (
134    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
135        "ldi r30, pm_lo8(add_%=_" STR(uECC_MAX_WORDS) ") \n\t"
136        "ldi r31, pm_hi8(add_%=_" STR(uECC_MAX_WORDS) ") \n\t"
137        "sub r30, %[num] \n\t"
138        "sbc r31, __zero_reg__ \n\t"
139    #endif
140
141        "clc \n\t"
142    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
143        IJMP
144        REPEATM(uECC_MAX_WORDS, ADD_RJPM_TABLE)
145    #endif
146
147        REPEATM(uECC_MAX_WORDS, ADD_RJPM_DEST)
148
149        "mov %[clb], __zero_reg__ \n\t"
150        "adc %[clb], %[clb] \n\t" /* Store carry bit. */
151
152        : "+x" (left), "+y" (right),
153          [clb] "=&r" (carry), [rb] "=&r" (right_byte)
154        : [result] "r" (r), [num] "r" ((uint8_t)(num_words * 2))
155        : "r30", "r31", "cc"
156    );
157    return carry;
158}
159#define asm_add 1
160
161#define SUB_RJPM_TABLE(N)       \
162    "movw r30, %A[result] \n\t" \
163    "rjmp sub_%=_" #N " \n\t"
164
165#define SUB_RJPM_DEST(N)     \
166    "sub_%=_" #N ":"         \
167    "ld %[clb], x+ \n\t"     \
168    "ld %[rb], y+ \n\t"      \
169    "sbc %[clb], %[rb] \n\t" \
170    "st z+, %[clb] \n\t"
171
172uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
173                                      const uECC_word_t *left,
174                                      const uECC_word_t *right,
175                                      wordcount_t num_words) {
176    volatile uECC_word_t *r = result;
177    uint8_t carry;
178    uint8_t right_byte;
179
180    __asm__ volatile (
181    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
182        "ldi r30, pm_lo8(sub_%=_" STR(uECC_MAX_WORDS) ") \n\t"
183        "ldi r31, pm_hi8(sub_%=_" STR(uECC_MAX_WORDS) ") \n\t"
184        "sub r30, %[num] \n\t"
185        "sbc r31, __zero_reg__ \n\t"
186    #endif
187
188        "clc \n\t"
189    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
190        IJMP
191        REPEATM(uECC_MAX_WORDS, SUB_RJPM_TABLE)
192    #endif
193
194        REPEATM(uECC_MAX_WORDS, SUB_RJPM_DEST)
195
196        "mov %[clb], __zero_reg__ \n\t"
197        "adc %[clb], %[clb] \n\t" /* Store carry bit. */
198
199        : "+x" (left), "+y" (right),
200          [clb] "=&r" (carry), [rb] "=&r" (right_byte)
201        : [result] "r" (r), [num] "r" ((uint8_t)(num_words * 2))
202        : "r30", "r31", "cc"
203    );
204    return carry;
205}
206#define asm_sub 1
207
208#if (uECC_OPTIMIZATION_LEVEL >= 3)
209
210#include "asm_avr_mult_square.inc"
211
212__attribute((noinline))
213uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
214                                const uECC_word_t *left,
215                                const uECC_word_t *right,
216                                wordcount_t num_words) {
217    /* num_words should already be in r18. */
218    register wordcount_t r18 __asm__("r18") = num_words;
219
220    __asm__ volatile (
221        "push r18 \n\t"
222#if (uECC_MIN_WORDS == 20)
223        FAST_MULT_ASM_20
224        "pop r18 \n\t"
225    #if (uECC_MAX_WORDS > 20)
226        FAST_MULT_ASM_20_TO_24
227    #endif
228    #if (uECC_MAX_WORDS > 24)
229        FAST_MULT_ASM_24_TO_28
230    #endif
231    #if (uECC_MAX_WORDS > 28)
232        FAST_MULT_ASM_28_TO_32
233    #endif
234#elif (uECC_MIN_WORDS == 24)
235        FAST_MULT_ASM_24
236        "pop r18 \n\t"
237    #if (uECC_MAX_WORDS > 24)
238        FAST_MULT_ASM_24_TO_28
239    #endif
240    #if (uECC_MAX_WORDS > 28)
241        FAST_MULT_ASM_28_TO_32
242    #endif
243#elif (uECC_MIN_WORDS == 28)
244        FAST_MULT_ASM_28
245        "pop r18 \n\t"
246    #if (uECC_MAX_WORDS > 28)
247        FAST_MULT_ASM_28_TO_32
248    #endif
249#elif (uECC_MIN_WORDS == 32)
250        FAST_MULT_ASM_32
251        "pop r18 \n\t"
252#endif
253        "2: \n\t"
254        "eor r1, r1 \n\t"
255        : "+x" (left), "+y" (right), "+z" (result)
256        : "r" (r18)
257        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
258          "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r19", "r20",
259          "r21", "r22", "r23", "r24", "r25", "cc"
260    );
261}
262#define asm_mult 1
263
264#if uECC_SQUARE_FUNC
265__attribute((noinline))
266uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
267                                  const uECC_word_t *left,
268                                  wordcount_t num_words) {
269    /* num_words should already be in r20. */
270    register wordcount_t r20 __asm__("r20") = num_words;
271
272    __asm__ volatile (
273        "push r20 \n\t"
274#if (uECC_MIN_WORDS == 20)
275        FAST_SQUARE_ASM_20
276        "pop r20 \n\t"
277    #if (uECC_MAX_WORDS > 20)
278        FAST_SQUARE_ASM_20_TO_24
279    #endif
280    #if (uECC_MAX_WORDS > 24)
281        FAST_SQUARE_ASM_24_TO_28
282    #endif
283    #if (uECC_MAX_WORDS > 28)
284        FAST_SQUARE_ASM_28_TO_32
285    #endif
286#elif (uECC_MIN_WORDS == 24)
287        FAST_SQUARE_ASM_24
288        "pop r20 \n\t"
289    #if (uECC_MAX_WORDS > 24)
290        FAST_SQUARE_ASM_24_TO_28
291    #endif
292    #if (uECC_MAX_WORDS > 28)
293        FAST_SQUARE_ASM_28_TO_32
294    #endif
295#elif (uECC_MIN_WORDS == 28)
296        FAST_SQUARE_ASM_28
297        "pop r20 \n\t"
298    #if (uECC_MAX_WORDS > 28)
299        FAST_SQUARE_ASM_28_TO_32
300    #endif
301#elif (uECC_MIN_WORDS == 32)
302        FAST_SQUARE_ASM_32
303        "pop r20 \n\t"
304#endif
305        "2: \n\t"
306        "eor r1, r1 \n\t"
307        : "+x" (left), "+z" (result)
308        : "r" (r20)
309        : "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
310          "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19",
311          "r21", "r22", "r23", "r24", "r25", "r28", "r29", "cc"
312    );
313}
314#define asm_square 1
315#endif /* uECC_SQUARE_FUNC */
316
317#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
318
#if uECC_SUPPORTS_secp160r1
static const struct uECC_Curve_t curve_secp160r1;

/*
 * Reduce the 40-byte product to the 20-byte result modulo curve_secp160r1.p.
 *
 * With lo = product[0..19] and hi = product[20..39] the assembly computes
 *     tmp    = (hi << 31) + hi                (24 bytes, on the stack)
 *     result = lo + tmp[0..19]
 *     t2     = (tmp[20..23] << 31) + tmp[20..23]   (8 bytes, in product[12..19])
 *     result = result + t2
 * and counts the carries out of the two 20-byte additions in carry. The
 * "<< 31" is done as a 4-byte offset combined with a 1-bit right shift. The
 * C code afterwards subtracts the prime once per counted carry and once more
 * if result still compares greater than the prime.
 *
 * product is used as scratch and is partly overwritten. The 24-byte tmp
 * buffer is taken by moving the stack pointer down; interrupts are disabled
 * only around each stack-pointer update.
 *
 * Defines asm_mmod_fast_secp160r1 so that a generic implementation is not
 * compiled as well.
 */
static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
    uint8_t carry = 0;
    __asm__ volatile (
        /* Reserve 24 bytes of stack for tmp. SP_H is written with interrupts
           off; SREG is restored just before SP_L is written. */
        "in r30, __SP_L__ \n\t"
        "in r31, __SP_H__ \n\t"
        "sbiw r30, 24 \n\t"
        "in r0, __SREG__ \n\t"
        "cli \n\t"
        "out __SP_H__, r31 \n\t"
        "out __SREG__, r0 \n\t"
        "out __SP_L__, r30 \n\t"

        "adiw r30, 25 \n\t" /* we are shifting by 31 bits, so shift over 4 bytes
                               (+ 1 since z initially points below the stack) */
        "adiw r26, 40 \n\t" /* end of product */
        "ld r18, -x \n\t"   /* Load word. */
        "lsr r18 \n\t"      /* Shift. */
        "st -z, r18 \n\t"   /* Store the first result word. */

        /* Now we just do the remaining words with the carry bit (using ROR) */
        REPEAT(19,
            "ld r18, -x \n\t"
            "ror r18 \n\t"
            "st -z, r18 \n\t")

        "eor r18, r18 \n\t" /* r18 = 0 */
        "ror r18 \n\t"      /* get last bit */
        "st -z, r18 \n\t"   /* store it */

        "sbiw r30, 3 \n\t" /* move z back to point at tmp */
        /* now we add right (x is back at product + 20, i.e. at hi) */
        "ld r18, x+ \n\t"
        "st z+, r18 \n\t" /* the first 3 bytes do not need to be added */
        "ld r18, x+ \n\t"
        "st z+, r18 \n\t"
        "ld r18, x+ \n\t"
        "st z+, r18 \n\t"

        "ld r18, x+ \n\t"
        "ld r19, z \n\t"
        "add r18, r19 \n\t"
        "st z+, r18 \n\t"

        /* Now we just do the remaining words with the carry bit (using ADC) */
        REPEAT(16,
            "ld r18, x+ \n\t"
            "ld r19, z \n\t"
            "adc r18, r19 \n\t"
            "st z+, r18 \n\t")

        /* Propagate the carry over the remaining 4 bytes of tmp */
        "ld r18, z \n\t"
        "adc r18, r1 \n\t"
        "st z+, r18 \n\t"

        "ld r18, z \n\t"
        "adc r18, r1 \n\t"
        "st z+, r18 \n\t"

        "ld r18, z \n\t"
        "adc r18, r1 \n\t"
        "st z+, r18 \n\t"

        "ld r18, z \n\t"
        "adc r18, r1 \n\t"
        "st z+, r18 \n\t"

        "sbiw r30, 24 \n\t" /* move z back to point at tmp */
        "sbiw r26, 40 \n\t" /* move x back to point at product */

        /* add low bytes of tmp to product, storing in result */
        "ld r18, z+ \n\t"
        "ld r19, x+ \n\t"
        "add r18, r19 \n\t"
        "st y+, r18 \n\t"
        REPEAT(19,
            "ld r18, z+ \n\t"
            "ld r19, x+ \n\t"
            "adc r18, r19 \n\t"
            "st y+, r18 \n\t")
        "adc %[carry], __zero_reg__ \n\t" /* Add the carry flag into carry. */
        /* at this point x is 20 bytes into product, y is at the end of result,
           z is 20 bytes into tmp */
        "sbiw r28, 20 \n\t" /* move y back to point at result */
        "adiw r30, 4 \n\t"  /* move z to point to the end of tmp */

        /* do omega_mult again with the 4 relevant bytes */
        /* z points to the end of tmp; x is used as a write pointer into the
           low half of product, which is no longer needed */
        "ld r18, -z \n\t" /* Load word. */
        "lsr r18 \n\t"    /* Shift. */
        "st -x, r18 \n\t" /* Store the first result word. */

        "ld r18, -z \n\t"
        "ror r18 \n\t"
        "st -x, r18 \n\t"
        "ld r18, -z \n\t"
        "ror r18 \n\t"
        "st -x, r18 \n\t"
        "ld r18, -z \n\t"
        "ror r18 \n\t"
        "st -x, r18 \n\t"

        "eor r18, r18 \n\t" /* r18 = 0 */
        "ror r18 \n\t"      /* get last bit */
        "st -x, r18 \n\t"   /* store it */

        "sbiw r26, 3 \n\t" /* move x back to point at beginning */
        /* now we add a copy of the 4 bytes */
        "ld r18, z+ \n\t"
        "st x+, r18 \n\t" /* the first 3 bytes do not need to be added */
        "ld r18, z+ \n\t"
        "st x+, r18 \n\t"
        "ld r18, z+ \n\t"
        "st x+, r18 \n\t"

        "ld r18, z+ \n\t"
        "ld r19, x \n\t"
        "add r18, r19 \n\t"
        "st x+, r18 \n\t"

        /* Propagate over the remaining bytes */
        "ld r18, x \n\t"
        "adc r18, r1 \n\t"
        "st x+, r18 \n\t"

        "ld r18, x \n\t"
        "adc r18, r1 \n\t"
        "st x+, r18 \n\t"

        "ld r18, x \n\t"
        "adc r18, r1 \n\t"
        "st x+, r18 \n\t"

        "ld r18, x \n\t"
        "adc r18, r1 \n\t"
        "st x+, r18 \n\t"

        /* now z points to the end of tmp, x points just past the 8 bytes that
           were written (y still points at result) */
        "sbiw r26, 8 \n\t" /* move x back to point at beginning of actual data */
        /* add into result */
        "ld r18, x+ \n\t"
        "ld r19, y \n\t"
        "add r18, r19 \n\t"
        "st y+, r18 \n\t"
        REPEAT(7,
            "ld r18, x+ \n\t"
            "ld r19, y \n\t"
            "adc r18, r19 \n\t"
            "st y+, r18 \n\t")

        /* Done adding, now propagate carry bit */
        REPEAT(12,
            "ld r18, y \n\t"
            "adc r18, __zero_reg__ \n\t"
            "st y+, r18 \n\t")

        "adc %[carry], __zero_reg__ \n\t" /* Add the carry flag into carry. */
        "sbiw r28, 20 \n\t" /* move y back to point at result */

        /* z is one past tmp, which is one above the original stack pointer */
        "sbiw r30, 1 \n\t" /* fix stack pointer */
        "in r0, __SREG__ \n\t"
        "cli \n\t"
        "out __SP_H__, r31 \n\t"
        "out __SREG__, r0 \n\t"
        "out __SP_L__, r30 \n\t"

        : "+x" (product), [carry] "+r" (carry)
        : "y" (result)
        : "r0", "r18", "r19", "r30", "r31", "cc"
    );

    /* carry is 0, 1 or 2: remove the prime once for each counted carry. */
    if (carry > 0) {
        --carry;
        uECC_vli_sub(result, result, curve_secp160r1.p, 20);
    }
    if (carry > 0) {
        uECC_vli_sub(result, result, curve_secp160r1.p, 20);
    }
    /* One more subtraction if result still compares greater than the prime. */
    if (uECC_vli_cmp_unsafe(result, curve_secp160r1.p, 20) > 0) {
        uECC_vli_sub(result, result, curve_secp160r1.p, 20);
    }
}
#define asm_mmod_fast_secp160r1 1
#endif /* uECC_SUPPORTS_secp160r1 */
506
507#if uECC_SUPPORTS_secp256k1
508static const struct uECC_Curve_t curve_secp256k1;
509static void vli_mmod_fast_secp256k1(uECC_word_t *result, uECC_word_t *product) {
510    uint8_t carry = 0;
511    __asm__ volatile (
512        "in r30, __SP_L__ \n\t"
513    	"in r31, __SP_H__ \n\t"
514    	"sbiw r30, 37 \n\t"
515    	"in r0, __SREG__ \n\t"
516    	"cli \n\t"
517    	"out __SP_H__, r31 \n\t"
518    	"out __SREG__, r0 \n\t"
519    	"out __SP_L__, r30 \n\t"
520
521    	"adiw r30, 1 \n\t"  /* add 1 since z initially points below the stack */
522        "adiw r26, 32 \n\t" /* product + uECC_WORDS */
523        "ldi r25, 0x03 \n\t"
524        "ldi r24, 0xD1 \n\t"
525        "ld r18, x+ \n\t"
526        "ld r19, x+ \n\t"
527        "ld r20, x+ \n\t"
528        "ld r21, x+ \n\t"
529
530        "mul r24, r18 \n\t"
531        "st z+, r0 \n\t"
532        "mov r22, r1 \n\t"
533        "ldi r23, 0 \n\t"
534
535        "mul r24, r19 \n\t"
536        "add r22, r0 \n\t"
537        "adc r23, r1 \n\t" /* can't overflow */
538        "mul r25, r18 \n\t"
539        "add r22, r0 \n\t"
540        "adc r23, r1 \n\t" /* can't overflow */
541        "st z+, r22 \n\t"
542        "ldi r22, 0 \n\t"
543
544        "mul r24, r20 \n\t"
545        "add r23, r0 \n\t"
546        "adc r22, r1 \n\t"
547        "mul r25, r19 \n\t"
548        "add r23, r0 \n\t"
549        "adc r22, r1 \n\t"
550        "st z+, r23 \n\t"
551        "ldi r23, 0 \n\t"
552
553        "mul r24, r21 \n\t"
554        "add r22, r0 \n\t"
555        "adc r23, r1 \n\t"
556        "mul r25, r20 \n\t"
557        "add r22, r0 \n\t"
558        "adc r23, r1 \n\t"
559        "st z+, r22 \n\t"
560        "ldi r22, 0 \n\t"
561
562        /* now we start adding the 2^32 part as well */
563        "add r23, r18 \n\t" // 28
564        "adc r22, r22 \n\t"
565        "ld r18, x+ \n\t"
566        "mul r24, r18 \n\t"
567        "add r23, r0 \n\t"
568        "adc r22, r1 \n\t"
569        "mul r25, r21 \n\t"
570        "add r23, r0 \n\t"
571        "adc r22, r1 \n\t"
572        "st z+, r23 \n\t"
573        "ldi r23, 0 \n\t"
574
575        "add r22, r19 \n\t" // 27
576        "adc r23, r23 \n\t"
577        "ld r19, x+ \n\t"
578        "mul r24, r19 \n\t"
579        "add r22, r0 \n\t"
580        "adc r23, r1 \n\t"
581        "mul r25, r18 \n\t"
582        "add r22, r0 \n\t"
583        "adc r23, r1 \n\t"
584        "st z+, r22 \n\t"
585        "ldi r22, 0 \n\t"
586
587        REPEAT(6, // 26 - 3
588            "add r23, r20 \n\t"
589            "adc r22, r22 \n\t"
590            "ld r20, x+ \n\t"
591            "mul r24, r20 \n\t"
592            "add r23, r0 \n\t"
593            "adc r22, r1 \n\t"
594            "mul r25, r19 \n\t"
595            "add r23, r0 \n\t"
596            "adc r22, r1 \n\t"
597            "st z+, r23 \n\t"
598            "ldi r23, 0 \n\t"
599
600            "add r22, r21 \n\t"
601            "adc r23, r23 \n\t"
602            "ld r21, x+ \n\t"
603            "mul r24, r21 \n\t"
604            "add r22, r0 \n\t"
605            "adc r23, r1 \n\t"
606            "mul r25, r20 \n\t"
607            "add r22, r0 \n\t"
608            "adc r23, r1 \n\t"
609            "st z+, r22 \n\t"
610            "ldi r22, 0 \n\t"
611
612            "add r23, r18 \n\t"
613            "adc r22, r22 \n\t"
614            "ld r18, x+ \n\t"
615            "mul r24, r18 \n\t"
616            "add r23, r0 \n\t"
617            "adc r22, r1 \n\t"
618            "mul r25, r21 \n\t"
619            "add r23, r0 \n\t"
620            "adc r22, r1 \n\t"
621            "st z+, r23 \n\t"
622            "ldi r23, 0 \n\t"
623
624            "add r22, r19 \n\t"
625            "adc r23, r23 \n\t"
626            "ld r19, x+ \n\t"
627            "mul r24, r19 \n\t"
628            "add r22, r0 \n\t"
629            "adc r23, r1 \n\t"
630            "mul r25, r18 \n\t"
631            "add r22, r0 \n\t"
632            "adc r23, r1 \n\t"
633            "st z+, r22 \n\t"
634            "ldi r22, 0 \n\t")
635
636        "add r23, r20 \n\t" // 2
637        "adc r22, r22 \n\t"
638        "ld r20, x+ \n\t"
639        "mul r24, r20 \n\t"
640        "add r23, r0 \n\t"
641        "adc r22, r1 \n\t"
642        "mul r25, r19 \n\t"
643        "add r23, r0 \n\t"
644        "adc r22, r1 \n\t"
645        "st z+, r23 \n\t"
646        "ldi r23, 0 \n\t"
647
648        "add r22, r21 \n\t" // 1
649        "adc r23, r23 \n\t"
650        "ld r21, x+ \n\t"
651        "mul r24, r21 \n\t"
652        "add r22, r0 \n\t"
653        "adc r23, r1 \n\t"
654        "mul r25, r20 \n\t"
655        "add r22, r0 \n\t"
656        "adc r23, r1 \n\t"
657        "st z+, r22 \n\t"
658        "ldi r22, 0 \n\t"
659
660        /* Now finish the carries etc */
661        "add r23, r18 \n\t"
662        "adc r22, r22 \n\t"
663        "mul r25, r21 \n\t"
664        "add r23, r0 \n\t"
665        "adc r22, r1 \n\t"
666        "st z+, r23 \n\t"
667        "ldi r23, 0 \n\t"
668
669        "add r22, r19 \n\t"
670        "adc r23, r23 \n\t"
671        "st z+, r22 \n\t"
672        "ldi r22, 0 \n\t"
673
674        "add r23, r20 \n\t"
675        "adc r22, r22 \n\t"
676        "st z+, r23 \n\t"
677        "ldi r23, 0 \n\t"
678
679        "add r22, r21 \n\t"
680        "adc r23, r23 \n\t"
681        "st z+, r22 \n\t"
682        "st z+, r23 \n\t"
683        "eor r1, r1 \n\t" /* make r1 be 0 again */
684
685        "sbiw r30, 37 \n\t" /* move z back to point at tmp */
686        "subi r26, 64 \n\t" /* move x back to point at product */
687        "sbc r27, __zero_reg__ \n\t"
688
689        /* add low bytes of tmp to product, storing in result */
690        "ld r18, z+ \n\t"
691        "ld r19, x+ \n\t"
692        "add r18, r19 \n\t"
693        "st y+, r18 \n\t"
694        REPEAT(31,
695            "ld r18, z+ \n\t"
696            "ld r19, x+ \n\t"
697            "adc r18, r19 \n\t"
698            "st y+, r18 \n\t")
699
700        "adc %[carry], __zero_reg__ \n\t" /* Store carry bit (carry flag is cleared). */
701        /* at this point x is at the end of product, y is at the end of result,
702           z is 32 bytes into tmp */
703        "sbiw r28, 32 \n\t" /* move y back to point at result */
704
705        /* do omega_mult again with the 5 relevant bytes */
706        /* z points to tmp + uECC_WORDS, x points to the end of product */
707        "sbiw r26, 32 \n\t" /* shift x back to point into the product buffer
708                               (we can overwrite it now) */
709        "ld r18, z+ \n\t"
710        "ld r19, z+ \n\t"
711        "ld r20, z+ \n\t"
712        "ld r21, z+ \n\t"
713
714        "mul r24, r18 \n\t"
715        "st x+, r0 \n\t"
716        "mov r22, r1 \n\t"
717        "ldi r23, 0 \n\t"
718
719        "mul r24, r19 \n\t"
720        "add r22, r0 \n\t"
721        "adc r23, r1 \n\t" /* can't overflow */
722        "mul r25, r18 \n\t"
723        "add r22, r0 \n\t"
724        "adc r23, r1 \n\t" /* can't overflow */
725        "st x+, r22 \n\t"
726        "ldi r22, 0 \n\t"
727
728        "mul r24, r20 \n\t"
729        "add r23, r0 \n\t"
730        "adc r22, r1 \n\t"
731        "mul r25, r19 \n\t"
732        "add r23, r0 \n\t"
733        "adc r22, r1 \n\t"
734        "st x+, r23 \n\t"
735        "ldi r23, 0 \n\t"
736
737        "mul r24, r21 \n\t"
738        "add r22, r0 \n\t"
739        "adc r23, r1 \n\t"
740        "mul r25, r20 \n\t"
741        "add r22, r0 \n\t"
742        "adc r23, r1 \n\t"
743        "st x+, r22 \n\t"
744        "ldi r22, 0 \n\t"
745
746        "add r23, r18 \n\t"
747        "adc r22, r22 \n\t"
748        "ld r18, z+ \n\t"
749        "mul r24, r18 \n\t"
750        "add r23, r0 \n\t"
751        "adc r22, r1 \n\t"
752        "mul r25, r21 \n\t"
753        "add r23, r0 \n\t"
754        "adc r22, r1 \n\t"
755        "st x+, r23 \n\t"
756        "ldi r23, 0 \n\t"
757
758        /* Now finish the carries etc */
759        "add r22, r19 \n\t"
760        "adc r23, r23 \n\t"
761        "mul r25, r18 \n\t"
762        "add r22, r0 \n\t"
763        "adc r23, r1 \n\t"
764        "st x+, r22 \n\t"
765        "ldi r22, 0 \n\t"
766
767        "add r23, r20 \n\t"
768        "adc r22, r22 \n\t"
769        "st x+, r23 \n\t"
770        "ldi r23, 0 \n\t"
771
772        "add r22, r21 \n\t"
773        "adc r23, r23 \n\t"
774        "st x+, r22 \n\t"
775        "ldi r22, 0 \n\t"
776
777        "add r23, r18 \n\t"
778        "adc r22, r22 \n\t"
779        "st x+, r23 \n\t"
780        "st x+, r22 \n\t"
781        "eor r1, r1 \n\t" /* make r1 be 0 again */
782
783        /* now z points to the end of tmp, x points to the end of product
784           (y still points at result) */
785        "sbiw r26, 10 \n\t" /* move x back to point at beginning of actual data */
786        /* add into result */
787        "ld r18, x+ \n\t"
788        "ld r19, y \n\t"
789        "add r18, r19 \n\t"
790        "st y+, r18 \n\t"
791        REPEAT(9,
792            "ld r18, x+ \n\t"
793            "ld r19, y \n\t"
794            "adc r18, r19 \n\t"
795            "st y+, r18 \n\t")
796
797        /* Done adding, now propagate carry bit */
798        REPEAT(22,
799            "ld r18, y \n\t"
800            "adc r18, __zero_reg__ \n\t"
801            "st y+, r18 \n\t")
802
803        "adc %[carry], __zero_reg__ \n\t"    /* Store carry bit (carry flag is cleared). */
804        "sbiw r28, 32 \n\t" /* move y back to point at result */
805
806        "sbiw r30, 1 \n\t" /* fix stack pointer */
807    	"in r0, __SREG__ \n\t"
808    	"cli \n\t"
809    	"out __SP_H__, r31 \n\t"
810    	"out __SREG__, r0 \n\t"
811    	"out __SP_L__, r30 \n\t"
812
813        : "+x" (product), [carry] "+r" (carry)
814        : "y" (result)
815        : "r0", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r30", "r31", "cc"
816    );
817
818    if (carry > 0) {
819        --carry;
820        uECC_vli_sub(result, result, curve_secp256k1.p, 32);
821    }
822    if (carry > 0) {
823        uECC_vli_sub(result, result, curve_secp256k1.p, 32);
824    }
825    if (uECC_vli_cmp_unsafe(result, curve_secp256k1.p, 32) > 0) {
826        uECC_vli_sub(result, result, curve_secp256k1.p, 32);
827    }
828}
829#define asm_mmod_fast_secp256k1 1
830#endif /* uECC_SUPPORTS_secp256k1 */
831
832#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
833
834/* ---- "Small" implementations ---- */
835
836#if !asm_add
837uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
838                                      const uECC_word_t *left,
839                                      const uECC_word_t *right,
840                                      wordcount_t num_words) {
841    volatile uECC_word_t *r = result;
842    uint8_t carry = 0;
843    uint8_t left_byte;
844    uint8_t right_byte;
845
846    __asm__ volatile (
847        "clc \n\t"
848
849        "1: \n\t"
850        "ld %[left], x+ \n\t"  /* Load left byte. */
851        "ld %[right], y+ \n\t" /* Load right byte. */
852        "adc %[left], %[right] \n\t" /* Add. */
853        "st z+, %[left] \n\t"  /* Store the result. */
854        "dec %[i] \n\t"
855        "brne 1b \n\t"
856
857        "adc %[carry], %[carry] \n\t" /* Store carry bit. */
858
859        : "+z" (r), "+x" (left), "+y" (right), [i] "+r" (num_words),
860            [carry] "+r" (carry), [left] "=&r" (left_byte), [right] "=&r" (right_byte)
861        :
862        : "cc"
863    );
864    return carry;
865}
866#define asm_add 1
867#endif
868
869#if !asm_sub
870uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
871                                      const uECC_word_t *left,
872                                      const uECC_word_t *right,
873                                      wordcount_t num_words) {
874    volatile uECC_word_t *r = result;
875    uint8_t borrow = 0;
876    uint8_t left_byte;
877    uint8_t right_byte;
878
879    __asm__ volatile (
880        "clc \n\t"
881
882        "1: \n\t"
883        "ld %[left], x+ \n\t"  /* Load left byte. */
884        "ld %[right], y+ \n\t" /* Load right byte. */
885        "sbc %[left], %[right] \n\t" /* Subtract. */
886        "st z+, %[left] \n\t"  /* Store the result. */
887        "dec %[i] \n\t"
888        "brne 1b \n\t"
889
890        "adc %[borrow], %[borrow] \n\t" /* Store carry bit in borrow. */
891
892        : "+z" (r), "+x" (left), "+y" (right), [i] "+r" (i),
893            [borrow] "+r" (borrow), [left] "=&r" (left_byte), [right] "=&r" (right_byte)
894        :
895        : "cc"
896    );
897    return borrow;
898}
899#define asm_sub 1
900#endif
901
902#if !asm_mult
903__attribute((noinline))
904uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
905                                const uECC_word_t *left,
906                                const uECC_word_t *right,
907                                wordcount_t num_words) {
908    volatile uECC_word_t *r = result;
909    uint8_t r0 = 0;
910    uint8_t r1 = 0;
911    uint8_t r2 = 0;
912    uint8_t zero = 0;
913    uint8_t k, i;
914
915    __asm__ volatile (
916        "ldi %[k], 1 \n\t" /* k = 1; k < num_words; ++k */
917
918        "1: \n\t"
919        "ldi %[i], 0 \n\t"  /* i = 0; i < k; ++i */
920
921        "add r28, %[k] \n\t" /* pre-add right ptr */
922        "adc r29, %[zero] \n\t"
923
924        "2: \n\t"
925        "ld r0, x+ \n\t"
926        "ld r1, -y \n\t"
927        "mul r0, r1 \n\t"
928
929        "add %[r0], r0 \n\t"
930        "adc %[r1], r1 \n\t"
931        "adc %[r2], %[zero] \n\t"
932
933        "inc %[i] \n\t"
934        "cp %[i], %[k] \n\t"
935        "brlo 2b \n\t" /* loop if i < k */
936
937        "sub r26, %[k] \n\t" /* fix up left ptr */
938        "sbc r27, %[zero] \n\t"
939
940        "st z+, %[r0] \n\t"  /* Store the result. */
941        "mov %[r0], %[r1] \n\t"
942        "mov %[r1], %[r2] \n\t"
943        "mov %[r2], %[zero] \n\t"
944
945        "inc %[k] \n\t"
946        "cp %[k], %[num] \n\t"
947        "brlo 1b \n\t" /* loop if k < num_words */
948
949        /* second half */
950        "mov %[k], %[num] \n\t" /* k = num_words; k > 0; --k */
951        "add r28, %[num] \n\t" /* move right ptr to point at the end of right */
952        "adc r29, %[zero] \n\t"
953
954        "1: \n\t"
955        "ldi %[i], 0 \n\t" /* i = 0; i < k; ++i */
956
957        "2: \n\t"
958        "ld r0, x+ \n\t"
959        "ld r1, -y \n\t"
960        "mul r0, r1 \n\t"
961
962        "add %[r0], r0 \n\t"
963        "adc %[r1], r1 \n\t"
964        "adc %[r2], %[zero] \n\t"
965
966        "inc %[i] \n\t"
967        "cp %[i], %[k] \n\t"
968        "brlo 2b \n\t" /* loop if i < k */
969
970        "add r28, %[k] \n\t" /* fix up right ptr */
971        "adc r29, %[zero] \n\t"
972
973        "st z+, %[r0] \n\t"  /* Store the result. */
974        "mov %[r0], %[r1] \n\t"
975        "mov %[r1], %[r2] \n\t"
976        "mov %[r2], %[zero] \n\t"
977
978        "dec %[k] \n\t"
979        "sub r26, %[k] \n\t" /* fix up left ptr (after k is decremented, so next time
980                                we start 1 higher) */
981        "sbc r27, %[zero] \n\t"
982
983        "cp %[k], %[zero] \n\t"
984        "brne 1b \n\t" /* loop if k > 0 */
985
986        "st z+, %[r0] \n\t"  /* Store last result byte. */
987        "eor r1, r1 \n\t" /* fix r1 to be 0 again */
988
989        : "+z" (result), "+x" (left), "+y" (right),
990          [r0] "+r" (r0), [r1] "+r" (r1), [r2] "+r" (r2),
991          [zero] "+r" (zero), [num] "+r" (num_words),
992          [k] "=&r" (k), [i] "=&r" (i)
993        :
994        : "r0", "cc"
995    );
996}
997#define asm_mult 1
998#endif
999
1000#if (uECC_SQUARE_FUNC && !asm_square)
1001uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
1002                                  const uECC_word_t *left,
1003                                  wordcount_t num_words) {
1004    volatile uECC_word_t *r = result;
1005    uint8_t r0 = 0;
1006    uint8_t r1 = 0;
1007    uint8_t r2 = 0;
1008    uint8_t zero = 0;
1009    uint8_t k;
1010
1011    __asm__ volatile (
1012        "ldi %[k], 1 \n\t" /* k = 1; k < num_words * 2; ++k */
1013
1014        "1: \n\t"
1015
1016        "movw r26, %[orig] \n\t"  /* copy orig ptr to 'left' ptr */
1017        "movw r30, %[orig] \n\t"  /* copy orig ptr to 'right' ptr */
1018        "cp %[k], %[num] \n\t"
1019        "brlo 2f \n\t"
1020        "breq 2f \n\t"
1021
1022        /* when k > num_words, we start from (k - num_words) on the 'left' ptr */
1023        "add r26, %[k] \n\t"
1024        "adc r27, %[zero] \n\t"
1025        "sub r26, %[num] \n\t"
1026        "sbc r27, %[zero] \n\t"
1027        "add r30, %[num] \n\t" /* move right ptr to point at the end */
1028        "adc r31, %[zero] \n\t"
1029        "rjmp 3f \n\t"
1030
1031        "2: \n\t" /* when k <= num_words, we add k to the 'right' ptr */
1032        "add r30, %[k] \n\t" /* pre-add 'right' ptr */
1033        "adc r31, %[zero] \n\t"
1034
1035        "3: \n\t"
1036        "ld r0, x+ \n\t"
1037        "cp r26, r30 \n\t" /* if left == right here, then we are done after this mult
1038                              (and we don't need to double) */
1039        "breq 4f \n\t"
1040        "ld r1, -z \n\t"
1041        "mul r0, r1 \n\t"
1042
1043        /* add twice since it costs the same as doubling */
1044        "add %[r0], r0 \n\t"
1045        "adc %[r1], r1 \n\t"
1046        "adc %[r2], %[zero] \n\t"
1047        "add %[r0], r0 \n\t"
1048        "adc %[r1], r1 \n\t"
1049        "adc %[r2], %[zero] \n\t"
1050
1051        "cpse r26, r30 \n\t" /* if left == right here, then we are done */
1052        "rjmp 3b \n\t"
1053        "rjmp 5f \n\t" /* skip code for non-doubled mult */
1054
1055        "4: \n\t"
1056        "ld r1, -z \n\t"
1057        "mul r0, r1 \n\t"
1058        "add %[r0], r0 \n\t"
1059        "adc %[r1], r1 \n\t"
1060        "adc %[r2], %[zero] \n\t"
1061
1062        "5: \n\t"
1063        "movw r30, %[result] \n\t" /* make z point to result */
1064        "st z+, %[r0] \n\t"        /* Store the result. */
1065        "movw %[result], r30 \n\t" /* update result ptr*/
1066        "mov %[r0], %[r1] \n\t"
1067        "mov %[r1], %[r2] \n\t"
1068        "mov %[r2], %[zero] \n\t"
1069
1070        "inc %[k] \n\t"
1071        "cp %[k], %[max] \n\t"
1072        "brlo 1b \n\t" /* loop if k < num_words * 2 */
1073
1074        "movw r30, %[result] \n\t"  /* make z point to result */
1075        "st z+, %[r0] \n\t"  /* Store last result byte. */
1076        "eor r1, r1 \n\t" /* fix r1 to be 0 again */
1077
1078        : [result] "+r" (r),
1079          [r0] "+r" (r0), [r1] "+r" (r1), [r2] "+r" (r2), [zero] "+r" (zero),
1080          [k] "=&a" (k)
1081        : [orig] "r" (left), [max] "r" ((uint8_t)(2 * num_words)),
1082          [num] "r" (num_words)
1083        : "r0", "r26", "r27", "r30", "r31", "cc"
1084    );
1085}
1086#define asm_square 1
1087#endif /* uECC_SQUARE_FUNC && !asm_square */
1088
1089#endif /* _UECC_ASM_AVR_H_ */
1090