/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */

/* ARM / Thumb / Thumb-2 inline-assembly implementations of the uECC_vli_*
   multi-word primitives (add, sub, mult and, optionally, square).

   Which variant of each primitive gets compiled is selected by uECC_PLATFORM
   and uECC_OPTIMIZATION_LEVEL. Every variant defines asm_add / asm_sub /
   asm_mult / asm_square once it has provided its function, and the "small"
   fallbacks near the end of the file are only compiled when the corresponding
   macro is still unset. */

#ifndef _UECC_ASM_ARM_H_
#define _UECC_ASM_ARM_H_

/* uECC_MIN_WORDS: word count of the smallest enabled curve. The checks run
   from the largest size to the smallest, and each later match overrides the
   previous definition. */
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
    #define uECC_MIN_WORDS 8
#endif
#if uECC_SUPPORTS_secp224r1
    #undef uECC_MIN_WORDS
    #define uECC_MIN_WORDS 7
#endif
#if uECC_SUPPORTS_secp192r1
    #undef uECC_MIN_WORDS
    #define uECC_MIN_WORDS 6
#endif
#if uECC_SUPPORTS_secp160r1
    #undef uECC_MIN_WORDS
    #define uECC_MIN_WORDS 5
#endif

/* Operand constraints for read-write / write-only asm operands: the "l"
   constraint on Thumb-1, any general register ("r") on the other platforms. */
#if (uECC_PLATFORM == uECC_arm_thumb)
    #define REG_RW "+l"
    #define REG_WRITE "=l"
#else
    #define REG_RW "+r"
    #define REG_WRITE "=r"
#endif

/* Same idea, but the "l" constraint is also used on Thumb-2. Used by the
   unrolled add/sub below.
   NOTE(review): presumably this keeps every instruction in its 16-bit
   encoding so the computed-jump stride stays fixed -- confirm. */
#if (uECC_PLATFORM == uECC_arm_thumb || uECC_PLATFORM == uECC_arm_thumb2)
    #define REG_RW_LO "+l"
    #define REG_WRITE_LO "=l"
#else
    #define REG_RW_LO "+r"
    #define REG_WRITE_LO "=r"
#endif

/* Appended to the end of the asm blocks, which all start with
   ".syntax unified": empty on Thumb-2, otherwise switches the assembler back
   to divided syntax. */
#if (uECC_PLATFORM == uECC_arm_thumb2)
    #define RESUME_SYNTAX
#else
    #define RESUME_SYNTAX ".syntax divided \n\t"
#endif

#if (uECC_OPTIMIZATION_LEVEL >= 2)

/* Unrolled addition: result = left + right, word by word with carry
   propagation. Returns the carry out of the last word (0 or 1).

   The first word is added with a plain "adds"; the remaining words come from
   the REPEAT(DEC(uECC_MAX_WORDS), ...) expansion (REPEAT and DEC are defined
   outside this file -- presumably uECC_MAX_WORDS - 1 copies of the four
   instruction unit). When more than one word count is possible
   (uECC_MAX_WORDS != uECC_MIN_WORDS) a computed branch jumps past the units
   that num_words does not need. */
uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
                                      const uECC_word_t *left,
                                      const uECC_word_t *right,
                                      wordcount_t num_words) {
#if (uECC_MAX_WORDS != uECC_MIN_WORDS)
    /* Byte offset from label 1 to the first unit that has to run: one unit is
       four instructions, scaled by 2 for Thumb/Thumb-2 and by 4 for ARM
       (presumably the instruction size in bytes). The +1 on Thumb presumably
       sets bit 0 so that "bx" stays in Thumb state -- confirm. */
  #if (uECC_PLATFORM == uECC_arm_thumb) || (uECC_PLATFORM == uECC_arm_thumb2)
    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 2 + 1;
  #else /* ARM */
    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 4;
  #endif
#endif
    uint32_t carry;
    uint32_t left_word;
    uint32_t right_word;

    __asm__ volatile (
        ".syntax unified \n\t"
        "movs %[carry], #0 \n\t" /* carry = 0; the final adcs turns it into the carry flag */
    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
        /* Turn the offset into an absolute branch target. This happens before
           the first add because "adds" rewrites the flags. */
        "adr %[left], 1f \n\t"
        ".align 4 \n\t" /* NOTE(review): presumably keeps label 1 aligned as adr needs -- confirm */
        "adds %[jump], %[left] \n\t"
    #endif

        /* First word: plain add, which starts the carry chain. */
        "ldmia %[lptr]!, {%[left]} \n\t"
        "ldmia %[rptr]!, {%[right]} \n\t"
        "adds %[left], %[right] \n\t"
        "stmia %[dptr]!, {%[left]} \n\t"

    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
        "bx %[jump] \n\t" /* skip the units that are not needed */
    #endif
        "1: \n\t"
        REPEAT(DEC(uECC_MAX_WORDS),
            "ldmia %[lptr]!, {%[left]} \n\t"
            "ldmia %[rptr]!, {%[right]} \n\t"
            "adcs %[left], %[right] \n\t"
            "stmia %[dptr]!, {%[left]} \n\t")

        "adcs %[carry], %[carry] \n\t" /* carry = 0 + 0 + carry flag */
        RESUME_SYNTAX
        : [dptr] REG_RW_LO (result), [lptr] REG_RW_LO (left), [rptr] REG_RW_LO (right),
    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
          [jump] REG_RW_LO (jump),
    #endif
          [carry] REG_WRITE_LO (carry), [left] REG_WRITE_LO (left_word),
          [right] REG_WRITE_LO (right_word)
        :
        : "cc", "memory"
    );
    return carry;
}
#define asm_add 1

/* Unrolled subtraction: result = left - right, word by word with borrow
   propagation. Returns 1 if the subtraction borrowed out of the last word,
   0 otherwise.

   Structured exactly like the unrolled uECC_vli_add, with subs/sbcs in place
   of adds/adcs. */
uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
                                      const uECC_word_t *left,
                                      const uECC_word_t *right,
                                      wordcount_t num_words) {
#if (uECC_MAX_WORDS != uECC_MIN_WORDS)
    /* Byte offset from label 1 to the first unit that has to run; same
       calculation as in the unrolled uECC_vli_add. */
  #if (uECC_PLATFORM == uECC_arm_thumb) || (uECC_PLATFORM == uECC_arm_thumb2)
    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 2 + 1;
  #else /* ARM */
    uint32_t jump = (uECC_MAX_WORDS - num_words) * 4 * 4;
  #endif
#endif
    uint32_t carry;
    uint32_t left_word;
    uint32_t right_word;

    __asm__ volatile (
        ".syntax unified \n\t"
        "movs %[carry], #0 \n\t" /* carry = 0; the final adcs turns it into the carry flag */
    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
        /* Turn the offset into an absolute branch target before the first
           subtract, because "adds" rewrites the flags. */
        "adr %[left], 1f \n\t"
        ".align 4 \n\t" /* NOTE(review): presumably keeps label 1 aligned as adr needs -- confirm */
        "adds %[jump], %[left] \n\t"
    #endif

        /* First word: plain subtract, which starts the borrow chain. */
        "ldmia %[lptr]!, {%[left]} \n\t"
        "ldmia %[rptr]!, {%[right]} \n\t"
        "subs %[left], %[right] \n\t"
        "stmia %[dptr]!, {%[left]} \n\t"

    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
        "bx %[jump] \n\t" /* skip the units that are not needed */
    #endif
        "1: \n\t"
        REPEAT(DEC(uECC_MAX_WORDS),
            "ldmia %[lptr]!, {%[left]} \n\t"
            "ldmia %[rptr]!, {%[right]} \n\t"
            "sbcs %[left], %[right] \n\t"
            "stmia %[dptr]!, {%[left]} \n\t")

        "adcs %[carry], %[carry] \n\t" /* carry = 0 + 0 + carry flag */
        RESUME_SYNTAX
        : [dptr] REG_RW_LO (result), [lptr] REG_RW_LO (left), [rptr] REG_RW_LO (right),
    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
          [jump] REG_RW_LO (jump),
    #endif
          [carry] REG_WRITE_LO (carry), [left] REG_WRITE_LO (left_word),
          [right] REG_WRITE_LO (right_word)
        :
        : "cc", "memory"
    );
    return !carry; /* Note that on ARM, carry flag set means "no borrow" when subtracting
                      (for some reason...) */
}
#define asm_sub 1

#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */

#if (uECC_OPTIMIZATION_LEVEL >= 3)

/* The macro-based multiply/square below is not compiled for Thumb-1; that
   platform falls through to the "small" versions at the end of the file. */
#if (uECC_PLATFORM != uECC_arm_thumb)

/* Provides the FAST_MULT_ASM_* / FAST_SQUARE_ASM_* macros used below. */
#if uECC_ARM_USE_UMAAL
    #include "asm_arm_mult_square_umaal.inc"
#else
    #include "asm_arm_mult_square.inc"
#endif

#if (uECC_OPTIMIZATION_LEVEL == 3)

/* Multiplication built from a single asm block: the macro for the smallest
   enabled size (uECC_MIN_WORDS) followed by the *_TO_* extension macros up to
   uECC_MAX_WORDS.

   The arguments are pinned to r0-r3 with register variables.
   NOTE(review): presumably the FAST_MULT_ASM_* macros refer to those
   registers by name and branch to the trailing "1:" label once num_words
   words are done -- confirm against the included .inc file. */
uECC_VLI_API void uECC_vli_mult(uint32_t *result,
                                const uint32_t *left,
                                const uint32_t *right,
                                wordcount_t num_words) {
    register uint32_t *r0 __asm__("r0") = result;
    register const uint32_t *r1 __asm__("r1") = left;
    register const uint32_t *r2 __asm__("r2") = right;
    register uint32_t r3 __asm__("r3") = num_words;

    __asm__ volatile (
        ".syntax unified \n\t"
#if (uECC_MIN_WORDS == 5)
        FAST_MULT_ASM_5
    #if (uECC_MAX_WORDS > 5)
        FAST_MULT_ASM_5_TO_6
    #endif
    #if (uECC_MAX_WORDS > 6)
        FAST_MULT_ASM_6_TO_7
    #endif
    #if (uECC_MAX_WORDS > 7)
        FAST_MULT_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 6)
        FAST_MULT_ASM_6
    #if (uECC_MAX_WORDS > 6)
        FAST_MULT_ASM_6_TO_7
    #endif
    #if (uECC_MAX_WORDS > 7)
        FAST_MULT_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 7)
        FAST_MULT_ASM_7
    #if (uECC_MAX_WORDS > 7)
        FAST_MULT_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 8)
        FAST_MULT_ASM_8
#endif
        "1: \n\t"
        RESUME_SYNTAX
        : "+r" (r0), "+r" (r1), "+r" (r2)
        : "r" (r3)
        : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
    );
}
#define asm_mult 1

#if uECC_SQUARE_FUNC
/* Squaring counterpart of the level-3 uECC_vli_mult above: one asm block made
   of the FAST_SQUARE_ASM_* macro chain, with result, left and num_words
   pinned to r0, r1 and r2. */
uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
                                  const uECC_word_t *left,
                                  wordcount_t num_words) {
    register uint32_t *r0 __asm__("r0") = result;
    register const uint32_t *r1 __asm__("r1") = left;
    register uint32_t r2 __asm__("r2") = num_words;

    __asm__ volatile (
        ".syntax unified \n\t"
#if (uECC_MIN_WORDS == 5)
        FAST_SQUARE_ASM_5
    #if (uECC_MAX_WORDS > 5)
        FAST_SQUARE_ASM_5_TO_6
    #endif
    #if (uECC_MAX_WORDS > 6)
        FAST_SQUARE_ASM_6_TO_7
    #endif
    #if (uECC_MAX_WORDS > 7)
        FAST_SQUARE_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 6)
        FAST_SQUARE_ASM_6
    #if (uECC_MAX_WORDS > 6)
        FAST_SQUARE_ASM_6_TO_7
    #endif
    #if (uECC_MAX_WORDS > 7)
        FAST_SQUARE_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 7)
        FAST_SQUARE_ASM_7
    #if (uECC_MAX_WORDS > 7)
        FAST_SQUARE_ASM_7_TO_8
    #endif
#elif (uECC_MIN_WORDS == 8)
        FAST_SQUARE_ASM_8
#endif

        "1: \n\t"
        RESUME_SYNTAX
        : "+r" (r0), "+r" (r1)
        : "r" (r2)
        : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
    );
}
#define asm_square 1
#endif /* uECC_SQUARE_FUNC */

#else /* (uECC_OPTIMIZATION_LEVEL > 3) */

/* Multiplication with one stand-alone asm block per enabled curve size,
   selected by num_words at run time. If num_words matches none of the
   compiled-in sizes the function returns without executing any of them. */
uECC_VLI_API void uECC_vli_mult(uint32_t *result,
                                const uint32_t *left,
                                const uint32_t *right,
                                wordcount_t num_words) {
    register uint32_t *r0 __asm__("r0") = result;
    register const uint32_t *r1 __asm__("r1") = left;
    register const uint32_t *r2 __asm__("r2") = right;
    register uint32_t r3 __asm__("r3") = num_words;

#if uECC_SUPPORTS_secp160r1
    if (num_words == 5) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_MULT_ASM_5
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1), "+r" (r2)
            : "r" (r3)
            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
#if uECC_SUPPORTS_secp192r1
    if (num_words == 6) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_MULT_ASM_6
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1), "+r" (r2)
            : "r" (r3)
            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
#if uECC_SUPPORTS_secp224r1
    if (num_words == 7) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_MULT_ASM_7
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1), "+r" (r2)
            : "r" (r3)
            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
    if (num_words == 8) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_MULT_ASM_8
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1), "+r" (r2)
            : "r" (r3)
            : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
}
#define asm_mult 1

#if uECC_SQUARE_FUNC
/* Squaring with one stand-alone asm block per enabled curve size, selected by
   num_words at run time. If num_words matches none of the compiled-in sizes
   the function returns without executing any of them. */
uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
                                  const uECC_word_t *left,
                                  wordcount_t num_words) {
    register uint32_t *r0 __asm__("r0") = result;
    register const uint32_t *r1 __asm__("r1") = left;
    register uint32_t r2 __asm__("r2") = num_words;

#if uECC_SUPPORTS_secp160r1
    if (num_words == 5) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_SQUARE_ASM_5
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1)
            : "r" (r2)
            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
#if uECC_SUPPORTS_secp192r1
    if (num_words == 6) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_SQUARE_ASM_6
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1)
            : "r" (r2)
            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
#if uECC_SUPPORTS_secp224r1
    if (num_words == 7) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_SQUARE_ASM_7
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1)
            : "r" (r2)
            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
#if (uECC_SUPPORTS_secp256r1 || uECC_SUPPORTS_secp256k1)
    if (num_words == 8) {
        __asm__ volatile (
            ".syntax unified \n\t"
            FAST_SQUARE_ASM_8
            RESUME_SYNTAX
            : "+r" (r0), "+r" (r1)
            : "r" (r2)
            : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
        );
        return;
    }
#endif
}
#define asm_square 1
#endif /* uECC_SQUARE_FUNC */

#endif /* (uECC_OPTIMIZATION_LEVEL > 3) */

#endif /* uECC_PLATFORM != uECC_arm_thumb */

#endif /* (uECC_OPTIMIZATION_LEVEL >= 3) */

/* ---- "Small" implementations ---- */

#if !asm_add
/* Loop-based addition: result = left + right over num_words words. Returns
   the carry out of the last word (0 or 1).

   The carry is parked in a register between iterations because the loop
   counter's "subs" overwrites the flags. The loop body runs before the
   counter is tested, so num_words is expected to be at least 1. */
uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
                                      const uECC_word_t *left,
                                      const uECC_word_t *right,
                                      wordcount_t num_words) {
    uint32_t carry = 0;
    uint32_t left_word;
    uint32_t right_word;

    __asm__ volatile (
        ".syntax unified \n\t"
        "1: \n\t"
        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (carry = 0 after this). */
        "adcs %[left], %[left], %[right] \n\t"   /* Add with carry. */
        "adcs %[carry], %[carry], %[carry] \n\t" /* Store carry bit. */
        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
        "subs %[ctr], #1 \n\t"            /* Decrement counter. */
        "bne 1b \n\t"                     /* Loop until counter == 0. */
        RESUME_SYNTAX
        : [dptr] REG_RW (result), [lptr] REG_RW (left), [rptr] REG_RW (right),
          [ctr] REG_RW (num_words), [carry] REG_RW (carry),
          [left] REG_WRITE (left_word), [right] REG_WRITE (right_word)
        :
        : "cc", "memory"
    );
    return carry;
}
#define asm_add 1
#endif

#if !asm_sub
/* Loop-based subtraction: result = left - right over num_words words. Returns
   1 if the subtraction borrowed out of the last word, 0 otherwise.

   Same loop shape as the small uECC_vli_add; the saved flag starts at 1
   because a set carry flag means "no borrow" for sbcs. */
uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
                                      const uECC_word_t *left,
                                      const uECC_word_t *right,
                                      wordcount_t num_words) {
    uint32_t carry = 1; /* carry = 1 initially (means don't borrow) */
    uint32_t left_word;
    uint32_t right_word;

    __asm__ volatile (
        ".syntax unified \n\t"
        "1: \n\t"
        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (carry = 0 after this). */
        "sbcs %[left], %[left], %[right] \n\t"   /* Subtract with borrow. */
        "adcs %[carry], %[carry], %[carry] \n\t" /* Store carry bit. */
        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
        "subs %[ctr], #1 \n\t"            /* Decrement counter. */
        "bne 1b \n\t"                     /* Loop until counter == 0. */
        RESUME_SYNTAX
        : [dptr] REG_RW (result), [lptr] REG_RW (left), [rptr] REG_RW (right),
          [ctr] REG_RW (num_words), [carry] REG_RW (carry),
          [left] REG_WRITE (left_word), [right] REG_WRITE (right_word)
        :
        : "cc", "memory"
    );
    return !carry;
}
#define asm_sub 1
#endif

#if !asm_mult
/* Loop-based multiplication: result = left * right, writing result[0] through
   result[num_words * 2 - 1].

   Works one result word (column k) at a time: every product
   left[i] * right[k - i] belonging to the column is added into the three-word
   accumulator (c0, c1, c2), then c0 is stored to result[k] and the
   accumulator is shifted down by one word. k and i are kept as byte offsets
   (index times 4). */
uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
                                const uECC_word_t *left,
                                const uECC_word_t *right,
                                wordcount_t num_words) {
#if (uECC_PLATFORM != uECC_arm_thumb)
    uint32_t c0 = 0;
    uint32_t c1 = 0;
    uint32_t c2 = 0;
    uint32_t k = 0;
    uint32_t i;
    uint32_t t0, t1;

    __asm__ volatile (
        ".syntax unified \n\t"

        "1: \n\t" /* outer loop (k < num_words) */
        "movs %[i], #0 \n\t" /* i = 0 */
        "b 3f \n\t"

        "2: \n\t" /* outer loop (k >= num_words) */
        "movs %[i], %[k] \n\t"         /* i = k */
        "subs %[i], %[last_word] \n\t" /* i = k - (num_words - 1) (times 4) */

        "3: \n\t" /* inner loop */
        "subs %[t0], %[k], %[i] \n\t" /* t0 = k-i */

        "ldr %[t1], [%[right], %[t0]] \n\t" /* t1 = right[k - i] */
        "ldr %[t0], [%[left], %[i]] \n\t"   /* t0 = left[i] */

        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * right[k - i] */

        "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
        "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
        "adcs %[c2], %[c2], #0 \n\t"    /* add carry to c2 */

        "adds %[i], #4 \n\t"          /* i += 4 */
        "cmp %[i], %[last_word] \n\t" /* i > (num_words - 1) (times 4)? */
        "bgt 4f \n\t"                 /*   if so, exit the loop */
        "cmp %[i], %[k] \n\t"         /* i <= k? */
        "ble 3b \n\t"                 /*   if so, continue looping */

        "4: \n\t" /* end inner loop */

        "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
        "mov %[c0], %[c1] \n\t"       /* c0 = c1 */
        "mov %[c1], %[c2] \n\t"       /* c1 = c2 */
        "movs %[c2], #0 \n\t"         /* c2 = 0 */
        "adds %[k], #4 \n\t"          /* k += 4 */
        "cmp %[k], %[last_word] \n\t" /* k <= (num_words - 1) (times 4) ? */
        "ble 1b \n\t"                 /*   if so, loop back, start with i = 0 */
        "cmp %[k], %[last_word], lsl #1 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
        "ble 2b \n\t"                 /*   if so, loop back, start with i = (k + 1) - num_words */
        /* end outer loop */

        "str %[c0], [%[result], %[k]] \n\t" /* result[num_words * 2 - 1] = c0 */
        RESUME_SYNTAX
        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
          [k] "+r" (k), [i] "=&r" (i), [t0] "=&r" (t0), [t1] "=&r" (t1)
        : [result] "r" (result), [left] "r" (left), [right] "r" (right),
          [last_word] "r" ((num_words - 1) * 4)
        : "cc", "memory"
    );

#else /* Thumb-1 */
    /* Same column-wise algorithm, with each 32x32-bit product assembled from
       four 16x16-bit "muls" partial products. The accumulator (c0, c1, c2)
       and k are parked in r10, r11, r12 and r14 while the product is being
       computed, and the loop bounds live in r8 and r9.

       %[r0] is declared as an input operand but is reused as scratch inside
       the loop; result is kept on the stack, reloaded before each store and
       popped back into %[r0] at the end. */
    uint32_t r4, r5, r6, r7;

    __asm__ volatile (
        ".syntax unified \n\t"
        "subs %[r3], #1 \n\t" /* r3 = num_words - 1 */
        "lsls %[r3], #2 \n\t" /* r3 = (num_words - 1) * 4 */
        "mov r8, %[r3] \n\t"  /* r8 = (num_words - 1) * 4 */
        "lsls %[r3], #1 \n\t" /* r3 = (num_words - 1) * 8 */
        "mov r9, %[r3] \n\t"  /* r9 = (num_words - 1) * 8 */
        "movs %[r3], #0 \n\t" /* c0 = 0 */
        "movs %[r4], #0 \n\t" /* c1 = 0 */
        "movs %[r5], #0 \n\t" /* c2 = 0 */
        "movs %[r6], #0 \n\t" /* k = 0 */

        "push {%[r0]} \n\t" /* keep result on the stack */

        "1: \n\t" /* outer loop (k < num_words) */
        "movs %[r7], #0 \n\t" /* r7 = i = 0 */
        "b 3f \n\t"

        "2: \n\t" /* outer loop (k >= num_words) */
        "movs %[r7], %[r6] \n\t" /* r7 = k */
        "mov %[r0], r8 \n\t"     /* r0 = (num_words - 1) * 4 */
        "subs %[r7], %[r0] \n\t" /* r7 = i = k - (num_words - 1) (times 4) */

        "3: \n\t" /* inner loop */
        /* Park c0, c1, c2 and k in high registers while r3-r6 are used for
           the multiply. */
        "mov r10, %[r3] \n\t"
        "mov r11, %[r4] \n\t"
        "mov r12, %[r5] \n\t"
        "mov r14, %[r6] \n\t"
        "subs %[r0], %[r6], %[r7] \n\t" /* r0 = k - i */

        "ldr %[r4], [%[r2], %[r0]] \n\t" /* r4 = right[k - i] */
        "ldr %[r0], [%[r1], %[r7]] \n\t" /* r0 = left[i] */

        "lsrs %[r3], %[r0], #16 \n\t" /* r3 = a1 */
        "uxth %[r0], %[r0] \n\t"      /* r0 = a0 */

        "lsrs %[r5], %[r4], #16 \n\t" /* r5 = b1 */
        "uxth %[r4], %[r4] \n\t"      /* r4 = b0 */

        "movs %[r6], %[r3] \n\t"        /* r6 = a1 */
        "muls %[r6], %[r5], %[r6] \n\t" /* r6 = a1 * b1 */
        "muls %[r3], %[r4], %[r3] \n\t" /* r3 = b0 * a1 */
        "muls %[r5], %[r0], %[r5] \n\t" /* r5 = a0 * b1 */
        "muls %[r0], %[r4], %[r0] \n\t" /* r0 = a0 * b0 */

        /* Add middle terms */
        "lsls %[r4], %[r3], #16 \n\t"
        "lsrs %[r3], %[r3], #16 \n\t"
        "adds %[r0], %[r4] \n\t"
        "adcs %[r6], %[r3] \n\t"

        "lsls %[r4], %[r5], #16 \n\t"
        "lsrs %[r5], %[r5], #16 \n\t"
        "adds %[r0], %[r4] \n\t"
        "adcs %[r6], %[r5] \n\t"

        /* Bring the accumulator back; the product is now in (r0, r6). */
        "mov %[r3], r10\n\t"
        "mov %[r4], r11\n\t"
        "mov %[r5], r12\n\t"
        "adds %[r3], %[r0] \n\t" /* add low word to c0 */
        "adcs %[r4], %[r6] \n\t" /* add high word to c1, including carry */
        "movs %[r0], #0 \n\t"    /* r0 = 0 (does not affect carry bit) */
        "adcs %[r5], %[r0] \n\t" /* add carry to c2 */

        "mov %[r6], r14\n\t" /* r6 = k */

        "adds %[r7], #4 \n\t"   /* i += 4 */
        "cmp %[r7], r8 \n\t"    /* i > (num_words - 1) (times 4)? */
        "bgt 4f \n\t"           /*   if so, exit the loop */
        "cmp %[r7], %[r6] \n\t" /* i <= k? */
        "ble 3b \n\t"           /*   if so, continue looping */

        "4: \n\t" /* end inner loop */

        "ldr %[r0], [sp, #0] \n\t" /* r0 = result */

        "str %[r3], [%[r0], %[r6]] \n\t" /* result[k] = c0 */
        "mov %[r3], %[r4] \n\t" /* c0 = c1 */
        "mov %[r4], %[r5] \n\t" /* c1 = c2 */
        "movs %[r5], #0 \n\t"   /* c2 = 0 */
        "adds %[r6], #4 \n\t"   /* k += 4 */
        "cmp %[r6], r8 \n\t"    /* k <= (num_words - 1) (times 4) ? */
        "ble 1b \n\t"           /*   if so, loop back, start with i = 0 */
        "cmp %[r6], r9 \n\t"    /* k <= (num_words * 2 - 2) (times 4) ? */
        "ble 2b \n\t"           /*   if so, loop back, with i = (k + 1) - num_words */
        /* end outer loop */

        "str %[r3], [%[r0], %[r6]] \n\t" /* result[num_words * 2 - 1] = c0 */
        "pop {%[r0]} \n\t"               /* pop result off the stack */

        ".syntax divided \n\t"
        : [r3] "+l" (num_words), [r4] "=&l" (r4),
          [r5] "=&l" (r5), [r6] "=&l" (r6), [r7] "=&l" (r7)
        : [r0] "l" (result), [r1] "l" (left), [r2] "l" (right)
        : "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
    );
#endif
}
#define asm_mult 1
#endif

#if uECC_SQUARE_FUNC
#if !asm_square
/* Loop-based squaring: result = left * left, writing result[0] through
   result[num_words * 2 - 1].

   Same column-wise accumulation as the small uECC_vli_mult, except that the
   inner loop only visits index pairs with i <= k - i; a product with
   i < k - i is added to the accumulator twice to account for its mirrored
   partner. k, i and tt are byte offsets (index times 4). */
uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
                                  const uECC_word_t *left,
                                  wordcount_t num_words) {
#if (uECC_PLATFORM != uECC_arm_thumb)
    uint32_t c0 = 0;
    uint32_t c1 = 0;
    uint32_t c2 = 0;
    uint32_t k = 0;
    uint32_t i, tt;
    uint32_t t0, t1;

    __asm__ volatile (
        ".syntax unified \n\t"

        "1: \n\t" /* outer loop (k < num_words) */
        "movs %[i], #0 \n\t" /* i = 0 */
        "b 3f \n\t"

        "2: \n\t" /* outer loop (k >= num_words) */
        "movs %[i], %[k] \n\t"         /* i = k */
        "subs %[i], %[last_word] \n\t" /* i = k - (num_words - 1) (times 4) */

        "3: \n\t" /* inner loop */
        "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */

        "ldr %[t1], [%[left], %[tt]] \n\t" /* t1 = left[k - i] */
        "ldr %[t0], [%[left], %[i]] \n\t"  /* t0 = left[i] */

        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = left[i] * left[k - i] */

        "cmp %[i], %[tt] \n\t" /* (i < k - i) ? */
        "bge 4f \n\t"          /*   if i >= k - i, skip */
        /* i < k - i: add the product an extra time (doubling it). */
        "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
        "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
        "adcs %[c2], %[c2], #0 \n\t"    /* add carry to c2 */

        "4: \n\t"
        "adds %[c0], %[c0], %[t0] \n\t" /* add low word to c0 */
        "adcs %[c1], %[c1], %[t1] \n\t" /* add high word to c1, including carry */
        "adcs %[c2], %[c2], #0 \n\t"    /* add carry to c2 */

        "adds %[i], #4 \n\t"          /* i += 4 */
        "cmp %[i], %[k] \n\t"         /* i >= k? */
        "bge 5f \n\t"                 /*   if so, exit the loop */
        "subs %[tt], %[k], %[i] \n\t" /* tt = k - i */
        "cmp %[i], %[tt] \n\t"        /* i <= k - i? */
        "ble 3b \n\t"                 /*   if so, continue looping */

        "5: \n\t" /* end inner loop */

        "str %[c0], [%[result], %[k]] \n\t" /* result[k] = c0 */
        "mov %[c0], %[c1] \n\t"       /* c0 = c1 */
        "mov %[c1], %[c2] \n\t"       /* c1 = c2 */
        "movs %[c2], #0 \n\t"         /* c2 = 0 */
        "adds %[k], #4 \n\t"          /* k += 4 */
        "cmp %[k], %[last_word] \n\t" /* k <= (num_words - 1) (times 4) ? */
        "ble 1b \n\t"                 /*   if so, loop back, start with i = 0 */
        "cmp %[k], %[last_word], lsl #1 \n\t" /* k <= (num_words * 2 - 2) (times 4) ? */
        "ble 2b \n\t"                 /*   if so, loop back, start with i = (k + 1) - num_words */
        /* end outer loop */

        "str %[c0], [%[result], %[k]] \n\t" /* result[num_words * 2 - 1] = c0 */
        RESUME_SYNTAX
        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2),
          [k] "+r" (k), [i] "=&r" (i), [tt] "=&r" (tt), [t0] "=&r" (t0), [t1] "=&r" (t1)
        : [result] "r" (result), [left] "r" (left), [last_word] "r" ((num_words - 1) * 4)
        : "cc", "memory"
    );

#else
    /* Thumb-1: same algorithm, with each 32x32-bit product assembled from
       four 16x16-bit "muls" partial products. The accumulator (c0, c1, c2)
       and k are parked in r10, r11, r12 and r14 while the product is being
       computed, and the loop bounds live in r8 and r9.

       %[r0] is declared as an input operand but is reused as scratch inside
       the loop; result is kept on the stack, reloaded before each store and
       popped back into %[r0] at the end. */
    uint32_t r3, r4, r5, r6, r7;

    __asm__ volatile (
        ".syntax unified \n\t"
        "subs %[r2], #1 \n\t" /* r2 = num_words - 1 */
        "lsls %[r2], #2 \n\t" /* r2 = (num_words - 1) * 4 */
        "mov r8, %[r2] \n\t"  /* r8 = (num_words - 1) * 4 */
        "lsls %[r2], #1 \n\t" /* r2 = (num_words - 1) * 8 */
        "mov r9, %[r2] \n\t"  /* r9 = (num_words - 1) * 8 */
        "movs %[r2], #0 \n\t" /* c0 = 0 */
        "movs %[r3], #0 \n\t" /* c1 = 0 */
        "movs %[r4], #0 \n\t" /* c2 = 0 */
        "movs %[r5], #0 \n\t" /* k = 0 */

        "push {%[r0]} \n\t" /* keep result on the stack */

        "1: \n\t" /* outer loop (k < num_words) */
        "movs %[r6], #0 \n\t" /* r6 = i = 0 */
        "b 3f \n\t"

        "2: \n\t" /* outer loop (k >= num_words) */
        "movs %[r6], %[r5] \n\t" /* r6 = k */
        "mov %[r0], r8 \n\t"     /* r0 = (num_words - 1) * 4 */
        "subs %[r6], %[r0] \n\t" /* r6 = i = k - (num_words - 1) (times 4) */

        "3: \n\t" /* inner loop */
        /* Park c0, c1, c2 and k in high registers while r2-r5 are used for
           the multiply. */
        "mov r10, %[r2] \n\t"
        "mov r11, %[r3] \n\t"
        "mov r12, %[r4] \n\t"
        "mov r14, %[r5] \n\t"
        "subs %[r7], %[r5], %[r6] \n\t" /* r7 = k - i */

        "ldr %[r3], [%[r1], %[r7]] \n\t" /* r3 = left[k - i] */
        "ldr %[r0], [%[r1], %[r6]] \n\t" /* r0 = left[i] */

        "lsrs %[r2], %[r0], #16 \n\t" /* r2 = a1 */
        "uxth %[r0], %[r0] \n\t"      /* r0 = a0 */

        "lsrs %[r4], %[r3], #16 \n\t" /* r4 = b1 */
        "uxth %[r3], %[r3] \n\t"      /* r3 = b0 */

        "movs %[r5], %[r2] \n\t"        /* r5 = a1 */
        "muls %[r5], %[r4], %[r5] \n\t" /* r5 = a1 * b1 */
        "muls %[r2], %[r3], %[r2] \n\t" /* r2 = b0 * a1 */
        "muls %[r4], %[r0], %[r4] \n\t" /* r4 = a0 * b1 */
        "muls %[r0], %[r3], %[r0] \n\t" /* r0 = a0 * b0 */

        /* Add middle terms */
        "lsls %[r3], %[r2], #16 \n\t"
        "lsrs %[r2], %[r2], #16 \n\t"
        "adds %[r0], %[r3] \n\t"
        "adcs %[r5], %[r2] \n\t"

        "lsls %[r3], %[r4], #16 \n\t"
        "lsrs %[r4], %[r4], #16 \n\t"
        "adds %[r0], %[r3] \n\t"
        "adcs %[r5], %[r4] \n\t"

        /* Add to acc, doubling if necessary */
        "mov %[r2], r10\n\t"
        "mov %[r3], r11\n\t"
        "mov %[r4], r12\n\t"

        "cmp %[r6], %[r7] \n\t"  /* (i < k - i) ? */
        "bge 4f \n\t"            /*   if i >= k - i, skip */
        "movs %[r7], #0 \n\t"    /* r7 = 0 */
        "adds %[r2], %[r0] \n\t" /* add low word to c0 */
        "adcs %[r3], %[r5] \n\t" /* add high word to c1, including carry */
        "adcs %[r4], %[r7] \n\t" /* add carry to c2 */
        "4: \n\t"
        "movs %[r7], #0 \n\t"    /* r7 = 0 */
        "adds %[r2], %[r0] \n\t" /* add low word to c0 */
        "adcs %[r3], %[r5] \n\t" /* add high word to c1, including carry */
        "adcs %[r4], %[r7] \n\t" /* add carry to c2 */

        "mov %[r5], r14\n\t" /* r5 = k */

        "adds %[r6], #4 \n\t"           /* i += 4 */
        "cmp %[r6], %[r5] \n\t"         /* i >= k? */
        "bge 5f \n\t"                   /*   if so, exit the loop */
        "subs %[r7], %[r5], %[r6] \n\t" /* r7 = k - i */
        "cmp %[r6], %[r7] \n\t"         /* i <= k - i? */
        "ble 3b \n\t"                   /*   if so, continue looping */

        "5: \n\t" /* end inner loop */

        "ldr %[r0], [sp, #0] \n\t" /* r0 = result */

        "str %[r2], [%[r0], %[r5]] \n\t" /* result[k] = c0 */
        "mov %[r2], %[r3] \n\t" /* c0 = c1 */
        "mov %[r3], %[r4] \n\t" /* c1 = c2 */
        "movs %[r4], #0 \n\t"   /* c2 = 0 */
        "adds %[r5], #4 \n\t"   /* k += 4 */
        "cmp %[r5], r8 \n\t"    /* k <= (num_words - 1) (times 4) ? */
        "ble 1b \n\t"           /*   if so, loop back, start with i = 0 */
        "cmp %[r5], r9 \n\t"    /* k <= (num_words * 2 - 2) (times 4) ? */
        "ble 2b \n\t"           /*   if so, loop back, with i = (k + 1) - num_words */
        /* end outer loop */

        "str %[r2], [%[r0], %[r5]] \n\t" /* result[num_words * 2 - 1] = c0 */
        "pop {%[r0]} \n\t"               /* pop result off the stack */

        ".syntax divided \n\t"
        : [r2] "+l" (num_words), [r3] "=&l" (r3), [r4] "=&l" (r4),
          [r5] "=&l" (r5), [r6] "=&l" (r6), [r7] "=&l" (r7)
        : [r0] "l" (result), [r1] "l" (left)
        : "r8", "r9", "r10", "r11", "r12", "r14", "cc", "memory"
    );
#endif
}
#define asm_square 1
#endif
#endif /* uECC_SQUARE_FUNC */

#endif /* _UECC_ASM_ARM_H_ */