/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */

3#ifndef _UECC_ASM_ARM_H_
4#define _UECC_ASM_ARM_H_
5
6#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
7    #define uECC_MIN_WORDS 8
8#endif
9#if uECC_SUPPORTS_secp224r1
10    #undef uECC_MIN_WORDS
11    #define uECC_MIN_WORDS 7
12#endif
13#if uECC_SUPPORTS_secp192r1
14    #undef uECC_MIN_WORDS
15    #define uECC_MIN_WORDS 6
16#endif
17#if uECC_SUPPORTS_secp160r1
18    #undef uECC_MIN_WORDS
19    #define uECC_MIN_WORDS 5
20#endif
21
/* Operand constraints for the looping ("small") routines: "+" marks a
   read-write operand, "=" a write-only one. Thumb-1 builds use the "l"
   (low register) class, everything else uses any general register ("r"). */
#if (uECC_PLATFORM == uECC_arm_thumb)
    #define REG_RW "+l"
    #define REG_WRITE "=l"
#else
    #define REG_RW "+r"
    #define REG_WRITE "=r"
#endif

/* Same idea for the unrolled routines, but Thumb-2 builds are also limited to
   the low registers. */
#if (uECC_PLATFORM == uECC_arm_thumb || uECC_PLATFORM == uECC_arm_thumb2)
    #define REG_RW_LO "+l"
    #define REG_WRITE_LO "=l"
#else
    #define REG_RW_LO "+r"
    #define REG_WRITE_LO "=r"
#endif

/* Appended to asm blocks that begin with ".syntax unified": switches the
   assembler back to divided syntax, except on Thumb-2 where it expands to
   nothing. */
#if (uECC_PLATFORM == uECC_arm_thumb2)
    #define RESUME_SYNTAX
#else
    #define RESUME_SYNTAX ".syntax divided \n\t"
#endif

44#if (uECC_OPTIMIZATION_LEVEL >= 2)
45
46uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
47                                      const uECC_word_t *left,
48                                      const uECC_word_t *right,
49                                      wordcount_t num_words) {
50#if (uECC_MAX_WORDS != uECC_MIN_WORDS)
51  #if (uECC_PLATFORM == uECC_arm_thumb) || (uECC_PLATFORM == uECC_arm_thumb2)
52    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 2 + 1;
53  #else /* ARM */
54    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 4;
55  #endif
56#endif
57    uint32_t carry;
58    uint32_t left_word;
59    uint32_t right_word;
60
61    __asm__ volatile (
62        ".syntax unified \n\t"
63        "movs %[carry], #0 \n\t"
64    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
65        "adr %[left], 1f \n\t"
66        ".align 4 \n\t"
67        "adds %[jump], %[left] \n\t"
68    #endif
69
70        "ldmia %[lptr]!, {%[left]} \n\t"
71        "ldmia %[rptr]!, {%[right]} \n\t"
72        "adds %[left], %[right] \n\t"
73        "stmia %[dptr]!, {%[left]} \n\t"
74
75    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
76        "bx %[jump] \n\t"
77    #endif
78        "1: \n\t"
79        REPEAT(DEC(uECC_MAX_WORDS),
80            "ldmia %[lptr]!, {%[left]} \n\t"
81            "ldmia %[rptr]!, {%[right]} \n\t"
82            "adcs %[left], %[right] \n\t"
83            "stmia %[dptr]!, {%[left]} \n\t")
84
85        "adcs %[carry], %[carry] \n\t"
86        RESUME_SYNTAX
87        : [dptr] REG_RW_LO (result), [lptr] REG_RW_LO (left), [rptr] REG_RW_LO (right),
88    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
89          [jump] REG_RW_LO (jump),
90    #endif
91          [carry] REG_WRITE_LO (carry), [left] REG_WRITE_LO (left_word),
92          [right] REG_WRITE_LO (right_word)
93        :
94        : "cc", "memory"
95    );
96    return carry;
97}
98#define asm_add 1
99
100uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
101                                      const uECC_word_t *left,
102                                      const uECC_word_t *right,
103                                      wordcount_t num_words) {
104#if (uECC_MAX_WORDS != uECC_MIN_WORDS)
105  #if (uECC_PLATFORM == uECC_arm_thumb) || (uECC_PLATFORM == uECC_arm_thumb2)
106    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 2 + 1;
107  #else /* ARM */
108    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 4;
109  #endif
110#endif
111    uint32_t carry;
112    uint32_t left_word;
113    uint32_t right_word;
114
115    __asm__ volatile (
116        ".syntax unified \n\t"
117        "movs %[carry], #0 \n\t"
118    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
119        "adr %[left], 1f \n\t"
120        ".align 4 \n\t"
121        "adds %[jump], %[left] \n\t"
122    #endif
123
124        "ldmia %[lptr]!, {%[left]} \n\t"
125        "ldmia %[rptr]!, {%[right]} \n\t"
126        "subs %[left], %[right] \n\t"
127        "stmia %[dptr]!, {%[left]} \n\t"
128
129    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
130        "bx %[jump] \n\t"
131    #endif
132        "1: \n\t"
133        REPEAT(DEC(uECC_MAX_WORDS),
134            "ldmia %[lptr]!, {%[left]} \n\t"
135            "ldmia %[rptr]!, {%[right]} \n\t"
136            "sbcs %[left], %[right] \n\t"
137            "stmia %[dptr]!, {%[left]} \n\t")
138
139        "adcs %[carry], %[carry] \n\t"
140        RESUME_SYNTAX
141        : [dptr] REG_RW_LO (result), [lptr] REG_RW_LO (left), [rptr] REG_RW_LO (right),
142    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
143          [jump] REG_RW_LO (jump),
144    #endif
145          [carry] REG_WRITE_LO (carry), [left] REG_WRITE_LO (left_word),
146          [right] REG_WRITE_LO (right_word)
147        :
148        : "cc", "memory"
149    );
150    return !carry; /* Note that on ARM, carry flag set means "no borrow" when subtracting
151                      (for some reason...) */
152}
153#define asm_sub 1
154
155#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
156
157#if (uECC_OPTIMIZATION_LEVEL >= 3)
158
159#if (uECC_PLATFORM != uECC_arm_thumb)
160
161#if uECC_ARM_USE_UMAAL
162    #include "asm_arm_mult_square_umaal.inc"
163#else
164    #include "asm_arm_mult_square.inc"
165#endif
166
167#if (uECC_OPTIMIZATION_LEVEL == 3)
168
169uECC_VLI_API void uECC_vli_mult(uint32_t *result,
170                                const uint32_t *left,
171                                const uint32_t *right,
172                                wordcount_t num_words) {
173    register uint32_t *r0 __asm__("r0") = result;
174    register const uint32_t *r1 __asm__("r1") = left;
175    register const uint32_t *r2 __asm__("r2") = right;
176    register uint32_t r3 __asm__("r3") = num_words;
177
178    __asm__ volatile (
179        ".syntax unified \n\t"
180#if (uECC_MIN_WORDS == 5)
181        FAST_MULT_ASM_5
182    #if (uECC_MAX_WORDS > 5)
183        FAST_MULT_ASM_5_TO_6
184    #endif
185    #if (uECC_MAX_WORDS > 6)
186        FAST_MULT_ASM_6_TO_7
187    #endif
188    #if (uECC_MAX_WORDS > 7)
189        FAST_MULT_ASM_7_TO_8
190    #endif
191#elif (uECC_MIN_WORDS == 6)
192        FAST_MULT_ASM_6
193    #if (uECC_MAX_WORDS > 6)
194        FAST_MULT_ASM_6_TO_7
195    #endif
196    #if (uECC_MAX_WORDS > 7)
197        FAST_MULT_ASM_7_TO_8
198    #endif
199#elif (uECC_MIN_WORDS == 7)
200        FAST_MULT_ASM_7
201    #if (uECC_MAX_WORDS > 7)
202        FAST_MULT_ASM_7_TO_8
203    #endif
204#elif (uECC_MIN_WORDS == 8)
205        FAST_MULT_ASM_8
206#endif
207        "1: \n\t"
208        RESUME_SYNTAX
209        : "+r" (r0), "+r" (r1), "+r" (r2)
210        : "r" (r3)
211        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
212    );
213}
214#define asm_mult 1
215
#if uECC_SQUARE_FUNC
/* Squaring (optimization level 3): result = left * left.
 *
 * The arguments are pinned to r0-r2 with local register variables for the
 * FAST_SQUARE_ASM_* fragments (defined outside this section). As in the
 * level-3 multiply, one asm block chains the fragment for uECC_MIN_WORDS with
 * the *_TO_* extensions up to uECC_MAX_WORDS. Label 1 marks the end of the
 * chain; presumably the fragments branch to it once num_words (r2) words have
 * been handled - confirm against the .inc file.
 */
uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
                                  const uECC_word_t *left,
                                  wordcount_t num_words) {
    register uint32_t *r0 __asm__("r0") = result;
    register const uint32_t *r1 __asm__("r1") = left;
    register uint32_t r2 __asm__("r2") = num_words;

    __asm__ volatile (
        ".syntax unified \n\t"
#if (uECC_MIN_WORDS == 5)
        FAST_SQUARE_ASM_5
    #if (uECC_MAX_WORDS > 5)
        FAST_SQUARE_ASM_5_TO_6
    #endif
    #if (uECC_MAX_WORDS > 6)
        FAST_SQUARE_ASM_6_TO_7
    #endif
    #if (uECC_MAX_WORDS > 7)
        FAST_SQUARE_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 6)
        FAST_SQUARE_ASM_6
    #if (uECC_MAX_WORDS > 6)
        FAST_SQUARE_ASM_6_TO_7
    #endif
    #if (uECC_MAX_WORDS > 7)
        FAST_SQUARE_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 7)
        FAST_SQUARE_ASM_7
    #if (uECC_MAX_WORDS > 7)
        FAST_SQUARE_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 8)
        FAST_SQUARE_ASM_8
#endif

        "1: \n\t"
        RESUME_SYNTAX
        : "+r" (r0), "+r" (r1)
        : "r" (r2)
        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
    );
}
#define asm_square 1
#endif /* uECC_SQUARE_FUNC */

264#else /* (uECC_OPTIMIZATION_LEVEL > 3) */
265
266uECC_VLI_API void uECC_vli_mult(uint32_t *result,
267                                const uint32_t *left,
268                                const uint32_t *right,
269                                wordcount_t num_words) {
270    register uint32_t *r0 __asm__("r0") = result;
271    register const uint32_t *r1 __asm__("r1") = left;
272    register const uint32_t *r2 __asm__("r2") = right;
273    register uint32_t r3 __asm__("r3") = num_words;
274
275#if uECC_SUPPORTS_secp160r1
276    if (num_words == 5) {
277        __asm__ volatile (
278            ".syntax unified \n\t"
279            FAST_MULT_ASM_5
280            RESUME_SYNTAX
281            : "+r" (r0), "+r" (r1), "+r" (r2)
282            : "r" (r3)
283            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
284        );
285        return;
286    }
287#endif
288#if uECC_SUPPORTS_secp192r1
289    if (num_words == 6) {
290        __asm__ volatile (
291            ".syntax unified \n\t"
292            FAST_MULT_ASM_6
293            RESUME_SYNTAX
294            : "+r" (r0), "+r" (r1), "+r" (r2)
295            : "r" (r3)
296            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
297        );
298        return;
299    }
300#endif
301#if uECC_SUPPORTS_secp224r1
302    if (num_words == 7) {
303        __asm__ volatile (
304            ".syntax unified \n\t"
305            FAST_MULT_ASM_7
306            RESUME_SYNTAX
307            : "+r" (r0), "+r" (r1), "+r" (r2)
308            : "r" (r3)
309            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
310        );
311        return;
312    }
313#endif
314#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
315    if (num_words == 8) {
316        __asm__ volatile (
317            ".syntax unified \n\t"
318            FAST_MULT_ASM_8
319            RESUME_SYNTAX
320            : "+r" (r0), "+r" (r1), "+r" (r2)
321            : "r" (r3)
322            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
323        );
324        return;
325    }
326#endif
327}
328#define asm_mult 1
329
330#if uECC_SQUARE_FUNC
331uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
332                                  const uECC_word_t *left,
333                                  wordcount_t num_words) {
334    register uint32_t *r0 __asm__("r0") = result;
335    register const uint32_t *r1 __asm__("r1") = left;
336    register uint32_t r2 __asm__("r2") = num_words;
337
338#if uECC_SUPPORTS_secp160r1
339    if (num_words == 5) {
340        __asm__ volatile (
341            ".syntax unified \n\t"
342            FAST_SQUARE_ASM_5
343            RESUME_SYNTAX
344            : "+r" (r0), "+r" (r1)
345            : "r" (r2)
346            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
347        );
348        return;
349    }
350#endif
351#if uECC_SUPPORTS_secp192r1
352    if (num_words == 6) {
353        __asm__ volatile (
354            ".syntax unified \n\t"
355            FAST_SQUARE_ASM_6
356            RESUME_SYNTAX
357            : "+r" (r0), "+r" (r1)
358            : "r" (r2)
359            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
360        );
361        return;
362    }
363#endif
364#if uECC_SUPPORTS_secp224r1
365    if (num_words == 7) {
366        __asm__ volatile (
367            ".syntax unified \n\t"
368            FAST_SQUARE_ASM_7
369            RESUME_SYNTAX
370            : "+r" (r0), "+r" (r1)
371            : "r" (r2)
372            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
373        );
374        return;
375    }
376#endif
377#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
378    if (num_words == 8) {
379        __asm__ volatile (
380            ".syntax unified \n\t"
381            FAST_SQUARE_ASM_8
382            RESUME_SYNTAX
383            : "+r" (r0), "+r" (r1)
384            : "r" (r2)
385            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
386        );
387        return;
388    }
389#endif
390}
391#define asm_square 1
392#endif /* uECC_SQUARE_FUNC */
393
394#endif /* (uECC_OPTIMIZATION_LEVEL > 3) */
395
396#endif /* uECC_PLATFORM != uECC_arm_thumb */
397
398#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */
399
400/* ---- "Small" implementations ---- */
401
402#if !asm_add
403uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
404                                      const uECC_word_t *left,
405                                      const uECC_word_t *right,
406                                      wordcount_t num_words) {
407    uint32_t carry = 0;
408    uint32_t left_word;
409    uint32_t right_word;
410
411    __asm__ volatile (
412        ".syntax unified \n\t"
413        "1: \n\t"
414        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
415        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
416        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (carry = 0 after this). */
417        "adcs %[left], %[left], %[right] \n\t"   /* Add with carry. */
418        "adcs %[carry], %[carry], %[carry] \n\t" /* Store carry bit. */
419        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
420        "subs %[ctr], #1 \n\t"            /* Decrement counter. */
421        "bne 1b \n\t"                     /* Loop until counter == 0. */
422        RESUME_SYNTAX
423        : [dptr] REG_RW (result), [lptr] REG_RW (left), [rptr] REG_RW (right),
424          [ctr] REG_RW (num_words), [carry] REG_RW (carry),
425          [left] REG_WRITE (left_word), [right] REG_WRITE (right_word)
426        :
427        : "cc", "memory"
428    );
429    return carry;
430}
431#define asm_add 1
432#endif
433
434#if !asm_sub
435uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
436                                      const uECC_word_t *left,
437                                      const uECC_word_t *right,
438                                      wordcount_t num_words) {
439    uint32_t carry = 1; /* carry = 1 initially (means don't borrow) */
440    uint32_t left_word;
441    uint32_t right_word;
442
443    __asm__ volatile (
444        ".syntax unified \n\t"
445        "1: \n\t"
446        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
447        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
448        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (carry = 0 after this). */
449        "sbcs %[left], %[left], %[right] \n\t"   /* Subtract with borrow. */
450        "adcs %[carry], %[carry], %[carry] \n\t" /* Store carry bit. */
451        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
452        "subs %[ctr], #1 \n\t"            /* Decrement counter. */
453        "bne 1b \n\t"                     /* Loop until counter == 0. */
454        RESUME_SYNTAX
455        : [dptr] REG_RW (result), [lptr] REG_RW (left), [rptr] REG_RW (right),
456          [ctr] REG_RW (num_words), [carry] REG_RW (carry),
457          [left] REG_WRITE (left_word), [right] REG_WRITE (right_word)
458        :
459        : "cc", "memory"
460    );
461    return !carry;
462}
463#define asm_sub 1
464#endif
465
466#if !asm_mult
467uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
468                                const uECC_word_t *left,
469                                const uECC_word_t *right,
470                                wordcount_t num_words) {
471#if (uECC_PLATFORM != uECC_arm_thumb)
472    uint32_t c0 = 0;
473    uint32_t c1 = 0;
474    uint32_t c2 = 0;
475    uint32_t k = 0;
476    uint32_t i;
477    uint32_t t0, t1;
478
479    __asm__ volatile (
480        ".syntax unified \n\t"
481
482        "1: \n\t" /* outer loop (k < num_words) */
483        "movs %[i], #0 \n\t" /* i = 0 */
484        "b 3f \n\t"
485
486        "2: \n\t" /* outer loop (k >= num_words) */
487        "movs %[i], %[k] \n\t"         /* i = k */
488        "subs %[i], %[last_word] \n\t" /* i = k - (num_words - 1) (times 4) */
489
490        "3: \n\t" /* inner loop */
491        "subs %[t0], %[k], %[i] \n\t" /* t0 = k-i */
492
493        "ldr %[t1], [%[right], %[t0]] \n\t" /* t1 = right[k - i] */
494        "ldr %[t0], [%[left], %[i]] \n\t"   /* t0 = left[i] */
495
496        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * right[k - i] */
497
498        "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
499        "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
500        "adcs %[c2], %[c2], #0 \n\t"    /* add carry to c2 */
501
502        "adds %[i], #4 \n\t"          /* i += 4 */
503        "cmp %[i], %[last_word] \n\t" /* i > (num_words - 1) (times 4)? */
504        "bgt 4f \n\t"                 /*   if so, exit the loop */
505        "cmp %[i], %[k] \n\t"         /* i <= k? */
506        "ble 3b \n\t"                 /*   if so, continue looping */
507
508        "4: \n\t" /* end inner loop */
509
510        "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
511        "mov %[c0], %[c1] \n\t"       /* c0 = c1 */
512        "mov %[c1], %[c2] \n\t"       /* c1 = c2 */
513        "movs %[c2], #0 \n\t"         /* c2 = 0 */
514        "adds %[k], #4 \n\t"          /* k += 4 */
515        "cmp %[k], %[last_word] \n\t" /* k <= (num_words - 1) (times 4) ? */
516        "ble 1b \n\t"                 /*   if so, loop back, start with i = 0 */
517        "cmp %[k], %[last_word], lsl #1 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
518        "ble 2b \n\t"                 /*   if so, loop back, start with i = (k + 1) - num_words */
519        /* end outer loop */
520
521        "str %[c0], [%[result], %[k]] \n\t" /* result[num_words * 2 - 1] = c0 */
522        RESUME_SYNTAX
523        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
524          [k] "+r" (k), [i] "=&r" (i), [t0] "=&r" (t0), [t1] "=&r" (t1)
525        : [result] "r" (result), [left] "r" (left), [right] "r" (right),
526          [last_word] "r" ((num_words - 1) * 4)
527        : "cc", "memory"
528    );
529
530#else /* Thumb-1 */
531    uint32_t r4, r5, r6, r7;
532
533    __asm__ volatile (
534        ".syntax unified \n\t"
535        "subs %[r3], #1 \n\t" /* r3 = num_words - 1 */
536        "lsls %[r3], #2 \n\t" /* r3 = (num_words - 1) * 4 */
537        "mov r8, %[r3] \n\t"  /* r8 = (num_words - 1) * 4 */
538        "lsls %[r3], #1 \n\t" /* r3 = (num_words - 1) * 8 */
539        "mov r9, %[r3] \n\t"  /* r9 = (num_words - 1) * 8 */
540        "movs %[r3], #0 \n\t" /* c0 = 0 */
541        "movs %[r4], #0 \n\t" /* c1 = 0 */
542        "movs %[r5], #0 \n\t" /* c2 = 0 */
543        "movs %[r6], #0 \n\t" /* k = 0 */
544
545        "push {%[r0]} \n\t" /* keep result on the stack */
546
547        "1: \n\t" /* outer loop (k < num_words) */
548        "movs %[r7], #0 \n\t" /* r7 = i = 0 */
549        "b 3f \n\t"
550
551        "2: \n\t" /* outer loop (k >= num_words) */
552        "movs %[r7], %[r6] \n\t" /* r7 = k */
553        "mov %[r0], r8 \n\t"     /* r0 = (num_words - 1) * 4 */
554        "subs %[r7], %[r0] \n\t" /* r7 = i = k - (num_words - 1) (times 4) */
555
556        "3: \n\t" /* inner loop */
557        "mov r10, %[r3] \n\t"
558        "mov r11, %[r4] \n\t"
559        "mov r12, %[r5] \n\t"
560        "mov r14, %[r6] \n\t"
561        "subs %[r0], %[r6], %[r7] \n\t"          /* r0 = k - i */
562
563        "ldr %[r4], [%[r2], %[r0]] \n\t" /* r4 = right[k - i] */
564        "ldr %[r0], [%[r1], %[r7]] \n\t" /* r0 = left[i] */
565
566        "lsrs %[r3], %[r0], #16 \n\t" /* r3 = a1 */
567        "uxth %[r0], %[r0] \n\t"      /* r0 = a0 */
568
569        "lsrs %[r5], %[r4], #16 \n\t" /* r5 = b1 */
570        "uxth %[r4], %[r4] \n\t"      /* r4 = b0 */
571
572        "movs %[r6], %[r3] \n\t"        /* r6 = a1 */
573        "muls %[r6], %[r5], %[r6] \n\t" /* r6 = a1 * b1 */
574        "muls %[r3], %[r4], %[r3] \n\t" /* r3 = b0 * a1 */
575        "muls %[r5], %[r0], %[r5] \n\t" /* r5 = a0 * b1 */
576        "muls %[r0], %[r4], %[r0] \n\t" /* r0 = a0 * b0 */
577
578        /* Add middle terms */
579        "lsls %[r4], %[r3], #16 \n\t"
580        "lsrs %[r3], %[r3], #16 \n\t"
581        "adds %[r0], %[r4] \n\t"
582        "adcs %[r6], %[r3] \n\t"
583
584        "lsls %[r4], %[r5], #16 \n\t"
585        "lsrs %[r5], %[r5], #16 \n\t"
586        "adds %[r0], %[r4] \n\t"
587        "adcs %[r6], %[r5] \n\t"
588
589        "mov %[r3], r10\n\t"
590        "mov %[r4], r11\n\t"
591        "mov %[r5], r12\n\t"
592        "adds %[r3], %[r0] \n\t"         /* add low word to c0 */
593        "adcs %[r4], %[r6] \n\t"         /* add high word to c1, including carry */
594        "movs %[r0], #0 \n\t"            /* r0 = 0 (does not affect carry bit) */
595        "adcs %[r5], %[r0] \n\t"         /* add carry to c2 */
596
597        "mov %[r6], r14\n\t" /* r6 = k */
598
599        "adds %[r7], #4 \n\t"   /* i += 4 */
600        "cmp %[r7], r8 \n\t"    /* i > (num_words - 1) (times 4)? */
601        "bgt 4f \n\t"           /*   if so, exit the loop */
602        "cmp %[r7], %[r6] \n\t" /* i <= k? */
603        "ble 3b \n\t"           /*   if so, continue looping */
604
605        "4: \n\t" /* end inner loop */
606
607        "ldr %[r0], [sp, #0] \n\t" /* r0 = result */
608
609        "str %[r3], [%[r0], %[r6]] \n\t" /* result[k] = c0 */
610        "mov %[r3], %[r4] \n\t"          /* c0 = c1 */
611        "mov %[r4], %[r5] \n\t"          /* c1 = c2 */
612        "movs %[r5], #0 \n\t"            /* c2 = 0 */
613        "adds %[r6], #4 \n\t"            /* k += 4 */
614        "cmp %[r6], r8 \n\t"             /* k <= (num_words - 1) (times 4) ? */
615        "ble 1b \n\t"                    /*   if so, loop back, start with i = 0 */
616        "cmp %[r6], r9 \n\t"             /* k <= (num_words * 2 - 2) (times 4) ? */
617        "ble 2b \n\t"                    /*   if so, loop back, with i = (k + 1) - num_words */
618        /* end outer loop */
619
620        "str %[r3], [%[r0], %[r6]] \n\t" /* result[num_words * 2 - 1] = c0 */
621        "pop {%[r0]} \n\t"               /* pop result off the stack */
622
623        ".syntax divided \n\t"
624        : [r3] "+l" (num_words), [r4] "=&l" (r4),
625          [r5] "=&l" (r5), [r6] "=&l" (r6), [r7] "=&l" (r7)
626        : [r0] "l" (result), [r1] "l" (left), [r2] "l" (right)
627        : "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
628    );
629#endif
630}
631#define asm_mult 1
632#endif
633
634#if uECC_SQUARE_FUNC
635#if !asm_square
636uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
637                                  const uECC_word_t *left,
638                                  wordcount_t num_words) {
639#if (uECC_PLATFORM != uECC_arm_thumb)
640    uint32_t c0 = 0;
641    uint32_t c1 = 0;
642    uint32_t c2 = 0;
643    uint32_t k = 0;
644    uint32_t i, tt;
645    uint32_t t0, t1;
646
647    __asm__ volatile (
648        ".syntax unified \n\t"
649
650        "1: \n\t" /* outer loop (k < num_words) */
651        "movs %[i], #0 \n\t" /* i = 0 */
652        "b 3f \n\t"
653
654        "2: \n\t" /* outer loop (k >= num_words) */
655        "movs %[i], %[k] \n\t"         /* i = k */
656        "subs %[i], %[last_word] \n\t" /* i = k - (num_words - 1) (times 4) */
657
658        "3: \n\t" /* inner loop */
659        "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */
660
661        "ldr %[t1], [%[left], %[tt]] \n\t" /* t1 = left[k - i] */
662        "ldr %[t0], [%[left], %[i]] \n\t"  /* t0 = left[i] */
663
664        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * right[k - i] */
665
666        "cmp %[i], %[tt] \n\t"      /* (i < k - i) ? */
667        "bge 4f \n\t"               /*   if i >= k - i, skip */
668        "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
669        "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
670        "adcs %[c2], %[c2], #0 \n\t"    /* add carry to c2 */
671
672        "4: \n\t"
673        "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
674        "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
675        "adcs %[c2], %[c2], #0 \n\t"    /* add carry to c2 */
676
677        "adds %[i], #4 \n\t"          /* i += 4 */
678        "cmp %[i], %[k] \n\t"         /* i >= k? */
679        "bge 5f \n\t"                 /*   if so, exit the loop */
680        "subs %[tt], %[k], %[i] \n\t" /* tt = k - i */
681        "cmp %[i], %[tt] \n\t"        /* i <= k - i? */
682        "ble 3b \n\t"                 /*   if so, continue looping */
683
684        "5: \n\t" /* end inner loop */
685
686        "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
687        "mov %[c0], %[c1] \n\t"       /* c0 = c1 */
688        "mov %[c1], %[c2] \n\t"       /* c1 = c2 */
689        "movs %[c2], #0 \n\t"         /* c2 = 0 */
690        "adds %[k], #4 \n\t"          /* k += 4 */
691        "cmp %[k], %[last_word] \n\t" /* k <= (num_words - 1) (times 4) ? */
692        "ble 1b \n\t"                 /*   if so, loop back, start with i = 0 */
693        "cmp %[k], %[last_word], lsl #1 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
694        "ble 2b \n\t"                 /*   if so, loop back, start with i = (k + 1) - num_words */
695        /* end outer loop */
696
697        "str %[c0], [%[result], %[k]] \n\t" /* result[num_words * 2 - 1] = c0 */
698        RESUME_SYNTAX
699        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
700          [k] "+r" (k), [i] "=&r" (i), [tt] "=&r" (tt), [t0] "=&r" (t0), [t1] "=&r" (t1)
701        : [result] "r" (result), [left] "r" (left), [last_word] "r" ((num_words - 1) * 4)
702        : "cc", "memory"
703    );
704
705#else
706    uint32_t r3, r4, r5, r6, r7;
707
708    __asm__ volatile (
709        ".syntax unified \n\t"
710        "subs %[r2], #1 \n\t" /* r2 = num_words - 1 */
711        "lsls %[r2], #2 \n\t" /* r2 = (num_words - 1) * 4 */
712        "mov r8, %[r2] \n\t"  /* r8 = (num_words - 1) * 4 */
713        "lsls %[r2], #1 \n\t" /* r2 = (num_words - 1) * 8 */
714        "mov r9, %[r2] \n\t"  /* r9 = (num_words - 1) * 8 */
715        "movs %[r2], #0 \n\t" /* c0 = 0 */
716        "movs %[r3], #0 \n\t" /* c1 = 0 */
717        "movs %[r4], #0 \n\t" /* c2 = 0 */
718        "movs %[r5], #0 \n\t" /* k = 0 */
719
720        "push {%[r0]} \n\t" /* keep result on the stack */
721
722        "1: \n\t" /* outer loop (k < num_words) */
723        "movs %[r6], #0 \n\t" /* r6 = i = 0 */
724        "b 3f \n\t"
725
726        "2: \n\t" /* outer loop (k >= num_words) */
727        "movs %[r6], %[r5] \n\t" /* r6 = k */
728        "mov %[r0], r8 \n\t"     /* r0 = (num_words - 1) * 4 */
729        "subs %[r6], %[r0] \n\t" /* r6 = i = k - (num_words - 1) (times 4) */
730
731        "3: \n\t" /* inner loop */
732        "mov r10, %[r2] \n\t"
733        "mov r11, %[r3] \n\t"
734        "mov r12, %[r4] \n\t"
735        "mov r14, %[r5] \n\t"
736        "subs %[r7], %[r5], %[r6] \n\t"  /* r7 = k - i */
737
738        "ldr %[r3], [%[r1], %[r7]] \n\t" /* r3 = left[k - i] */
739        "ldr %[r0], [%[r1], %[r6]] \n\t" /* r0 = left[i] */
740
741        "lsrs %[r2], %[r0], #16 \n\t" /* r2 = a1 */
742        "uxth %[r0], %[r0] \n\t"      /* r0 = a0 */
743
744        "lsrs %[r4], %[r3], #16 \n\t" /* r4 = b1 */
745        "uxth %[r3], %[r3] \n\t"      /* r3 = b0 */
746
747        "movs %[r5], %[r2] \n\t"        /* r5 = a1 */
748        "muls %[r5], %[r4], %[r5] \n\t" /* r5 = a1 * b1 */
749        "muls %[r2], %[r3], %[r2] \n\t" /* r2 = b0 * a1 */
750        "muls %[r4], %[r0], %[r4] \n\t" /* r4 = a0 * b1 */
751        "muls %[r0], %[r3], %[r0] \n\t" /* r0 = a0 * b0 */
752
753        /* Add middle terms */
754        "lsls %[r3], %[r2], #16 \n\t"
755        "lsrs %[r2], %[r2], #16 \n\t"
756        "adds %[r0], %[r3] \n\t"
757        "adcs %[r5], %[r2] \n\t"
758
759        "lsls %[r3], %[r4], #16 \n\t"
760        "lsrs %[r4], %[r4], #16 \n\t"
761        "adds %[r0], %[r3] \n\t"
762        "adcs %[r5], %[r4] \n\t"
763
764        /* Add to acc, doubling if necessary */
765        "mov %[r2], r10\n\t"
766        "mov %[r3], r11\n\t"
767        "mov %[r4], r12\n\t"
768
769        "cmp %[r6], %[r7] \n\t"    /* (i < k - i) ? */
770        "bge 4f \n\t"            /*   if i >= k - i, skip */
771        "movs %[r7], #0 \n\t"    /* r7 = 0 */
772        "adds %[r2], %[r0] \n\t" /* add low word to c0 */
773        "adcs %[r3], %[r5] \n\t" /* add high word to c1, including carry */
774        "adcs %[r4], %[r7] \n\t" /* add carry to c2 */
775        "4: \n\t"
776        "movs %[r7], #0 \n\t"    /* r7 = 0 */
777        "adds %[r2], %[r0] \n\t" /* add low word to c0 */
778        "adcs %[r3], %[r5] \n\t" /* add high word to c1, including carry */
779        "adcs %[r4], %[r7] \n\t" /* add carry to c2 */
780
781        "mov %[r5], r14\n\t" /* r5 = k */
782
783        "adds %[r6], #4 \n\t"           /* i += 4 */
784        "cmp %[r6], %[r5] \n\t"         /* i >= k? */
785        "bge 5f \n\t"                   /*   if so, exit the loop */
786        "subs %[r7], %[r5], %[r6] \n\t" /* r7 = k - i */
787        "cmp %[r6], %[r7] \n\t"         /* i <= k - i? */
788        "ble 3b \n\t"                   /*   if so, continue looping */
789
790        "5: \n\t" /* end inner loop */
791
792        "ldr %[r0], [sp, #0] \n\t" /* r0 = result */
793
794        "str %[r2], [%[r0], %[r5]] \n\t" /* result[k] = c0 */
795        "mov %[r2], %[r3] \n\t"          /* c0 = c1 */
796        "mov %[r3], %[r4] \n\t"          /* c1 = c2 */
797        "movs %[r4], #0 \n\t"            /* c2 = 0 */
798        "adds %[r5], #4 \n\t"            /* k += 4 */
799        "cmp %[r5], r8 \n\t"             /* k <= (num_words - 1) (times 4) ? */
800        "ble 1b \n\t"                    /*   if so, loop back, start with i = 0 */
801        "cmp %[r5], r9 \n\t"             /* k <= (num_words * 2 - 2) (times 4) ? */
802        "ble 2b \n\t"                    /*   if so, loop back, with i = (k + 1) - num_words */
803        /* end outer loop */
804
805        "str %[r2], [%[r0], %[r5]] \n\t" /* result[num_words * 2 - 1] = c0 */
806        "pop {%[r0]} \n\t"               /* pop result off the stack */
807
808        ".syntax divided \n\t"
809        : [r2] "+l" (num_words), [r3] "=&l" (r3), [r4] "=&l" (r4),
810          [r5] "=&l" (r5), [r6] "=&l" (r6), [r7] "=&l" (r7)
811        : [r0] "l" (result), [r1] "l" (left)
812        : "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
813    );
814#endif
815}
816#define asm_square 1
817#endif
818#endif /* uECC_SQUARE_FUNC */
819
820#endif /* _UECC_ASM_ARM_H_ */
821