// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	6
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_mul_by_2
#endif
.align	4
__ecp_nistz256_mul_by_2:
	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	adds	r4,r4,r4	@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	r7,[r1,#12]
	adcs	r5,r5,r5
	ldr	r8,[r1,#16]
	adcs	r6,r6,r6
	ldr	r9,[r1,#20]
	adcs	r7,r7,r7
	ldr	r10,[r1,#24]
	adcs	r8,r8,r8
	ldr	r11,[r1,#28]
	adcs	r9,r9,r9
	adcs	r10,r10,r10
	mov	r3,#0
	adcs	r11,r11,r11
	adc	r3,r3,#0

	b	Lreduce_by_sub


@ void	GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@				 const BN_ULONG r2[8]);
.globl	_GFp_nistz256_add
.private_extern	_GFp_nistz256_add
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_add
#endif
.align	4
_GFp_nistz256_add:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr	@ interoperable with Thumb ISA:-)
#endif


#ifdef __thumb2__
.thumb_func	__ecp_nistz256_add
#endif
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!	@ push lr

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	ldr	r7,[r1,#12]
	ldr	r8,[r1,#16]
	ldr	r3,[r2,#0]
	ldr	r9,[r1,#20]
	ldr	r12,[r2,#4]
	ldr	r10,[r1,#24]
	ldr	r14,[r2,#8]
	ldr	r11,[r1,#28]
	ldr	r1,[r2,#12]
	adds	r4,r4,r3
	ldr	r3,[r2,#16]
	adcs	r5,r5,r12
	ldr	r12,[r2,#20]
	adcs	r6,r6,r14
	ldr	r14,[r2,#24]
	adcs	r7,r7,r1
	ldr	r1,[r2,#28]
	adcs	r8,r8,r3
	adcs	r9,r9,r12
	adcs	r10,r10,r14
	mov	r3,#0
	adcs	r11,r11,r1
	adc	r3,r3,#0
	ldr	lr,[sp],#4	@ pop lr

Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	r4,r4,#-1
	sbcs	r5,r5,#-1
	sbcs	r6,r6,#-1
	sbcs	r7,r7,#0
	sbcs	r8,r8,#0
	sbcs	r9,r9,#0
	sbcs	r10,r10,#1
	sbcs	r11,r11,#-1
	sbc	r3,r3,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow r3 register...

	adds	r4,r4,r3	@ add synthesized modulus
	adcs	r5,r5,r3
	str	r4,[r0,#0]
	adcs	r6,r6,r3
	str	r5,[r0,#4]
	adcs	r7,r7,#0
	str	r6,[r0,#8]
	adcs	r8,r8,#0
	str	r7,[r0,#12]
	adcs	r9,r9,#0
	str	r8,[r0,#16]
	adcs	r10,r10,r3,lsr#31
	str	r9,[r0,#20]
	adcs	r11,r11,r3
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr

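@ For reference, the P-256 modulus handled by Lreduce_by_sub/Lreduce_by_add,
@ p = 2^256-2^224+2^192+2^96-1, is, as 32-bit words from least to most
@ significant:
@	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
@	0x00000000, 0x00000000, 0x00000001, 0xffffffff
@ which is exactly what the -1/0/1 immediates above subtract word by word,
@ and what the borrow-derived r3 (0 or 0xffffffff) and r3>>31 (0 or 1)
@ conditionally add back.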

#ifdef __thumb2__
.thumb_func	__ecp_nistz256_mul_by_3
#endif
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!	@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	adds	r4,r4,r4	@ a[0:7]+=a[0:7]
	ldr	r7,[r1,#12]
	adcs	r5,r5,r5
	ldr	r8,[r1,#16]
	adcs	r6,r6,r6
	ldr	r9,[r1,#20]
	adcs	r7,r7,r7
	ldr	r10,[r1,#24]
	adcs	r8,r8,r8
	ldr	r11,[r1,#28]
	adcs	r9,r9,r9
	adcs	r10,r10,r10
	mov	r3,#0
	adcs	r11,r11,r11
	adc	r3,r3,#0

	subs	r4,r4,#-1	@ Lreduce_by_sub but without stores
	sbcs	r5,r5,#-1
	sbcs	r6,r6,#-1
	sbcs	r7,r7,#0
	sbcs	r8,r8,#0
	sbcs	r9,r9,#0
	sbcs	r10,r10,#1
	sbcs	r11,r11,#-1
	sbc	r3,r3,#0

	adds	r4,r4,r3	@ add synthesized modulus
	adcs	r5,r5,r3
	adcs	r6,r6,r3
	adcs	r7,r7,#0
	adcs	r8,r8,#0
	ldr	r2,[r1,#0]
	adcs	r9,r9,#0
	ldr	r12,[r1,#4]
	adcs	r10,r10,r3,lsr#31
	ldr	r14,[r1,#8]
	adc	r11,r11,r3

	ldr	r3,[r1,#12]
	adds	r4,r4,r2	@ 2*a[0:7]+=a[0:7]
	ldr	r2,[r1,#16]
	adcs	r5,r5,r12
	ldr	r12,[r1,#20]
	adcs	r6,r6,r14
	ldr	r14,[r1,#24]
	adcs	r7,r7,r3
	ldr	r1,[r1,#28]
	adcs	r8,r8,r2
	adcs	r9,r9,r12
	adcs	r10,r10,r14
	mov	r3,#0
	adcs	r11,r11,r1
	adc	r3,r3,#0
	ldr	lr,[sp],#4	@ pop lr

	b	Lreduce_by_sub


#ifdef __thumb2__
.thumb_func	__ecp_nistz256_div_by_2
#endif
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	mov	r3,r4,lsl#31	@ place least significant bit to most
				@ significant position, now arithmetic
				@ right shift by 31 will produce -1 or
				@ 0, while logical right shift 1 or 0,
				@ this is how modulus is conditionally
				@ synthesized in this case...
	ldr	r7,[r1,#12]
	adds	r4,r4,r3,asr#31
	ldr	r8,[r1,#16]
	adcs	r5,r5,r3,asr#31
	ldr	r9,[r1,#20]
	adcs	r6,r6,r3,asr#31
	ldr	r10,[r1,#24]
	adcs	r7,r7,#0
	ldr	r11,[r1,#28]
	adcs	r8,r8,#0
	mov	r4,r4,lsr#1	@ a[0:7]>>=1, we can start early
				@ because it doesn't affect flags
	adcs	r9,r9,#0
	orr	r4,r4,r5,lsl#31
	adcs	r10,r10,r3,lsr#31
	mov	r2,#0
	adcs	r11,r11,r3,asr#31
	mov	r5,r5,lsr#1
	adc	r2,r2,#0	@ top-most carry bit from addition

	orr	r5,r5,r6,lsl#31
	mov	r6,r6,lsr#1
	str	r4,[r0,#0]
	orr	r6,r6,r7,lsl#31
	mov	r7,r7,lsr#1
	str	r5,[r0,#4]
	orr	r7,r7,r8,lsl#31
	mov	r8,r8,lsr#1
	str	r6,[r0,#8]
	orr	r8,r8,r9,lsl#31
	mov	r9,r9,lsr#1
	str	r7,[r0,#12]
	orr	r9,r9,r10,lsl#31
	mov	r10,r10,lsr#1
	str	r8,[r0,#16]
	orr	r10,r10,r11,lsl#31
	mov	r11,r11,lsr#1
	str	r9,[r0,#20]
	orr	r11,r11,r2,lsl#31	@ don't forget the top-most carry bit
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr


#ifdef __thumb2__
.thumb_func	__ecp_nistz256_sub
#endif
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!	@ push lr

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	ldr	r7,[r1,#12]
	ldr	r8,[r1,#16]
	ldr	r3,[r2,#0]
	ldr	r9,[r1,#20]
	ldr	r12,[r2,#4]
	ldr	r10,[r1,#24]
	ldr	r14,[r2,#8]
	ldr	r11,[r1,#28]
	ldr	r1,[r2,#12]
	subs	r4,r4,r3
	ldr	r3,[r2,#16]
	sbcs	r5,r5,r12
	ldr	r12,[r2,#20]
	sbcs	r6,r6,r14
	ldr	r14,[r2,#24]
	sbcs	r7,r7,r1
	ldr	r1,[r2,#28]
	sbcs	r8,r8,r3
	sbcs	r9,r9,r12
	sbcs	r10,r10,r14
	sbcs	r11,r11,r1
	sbc	r3,r3,r3	@ broadcast borrow bit
	ldr	lr,[sp],#4	@ pop lr

Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, r3, and using it as
	@ a whole or extracting single bit.

	adds	r4,r4,r3	@ add synthesized modulus
	adcs	r5,r5,r3
	str	r4,[r0,#0]
	adcs	r6,r6,r3
	str	r5,[r0,#4]
	adcs	r7,r7,#0
	str	r6,[r0,#8]
	adcs	r8,r8,#0
	str	r7,[r0,#12]
	adcs	r9,r9,#0
	str	r8,[r0,#16]
	adcs	r10,r10,r3,lsr#31
	str	r9,[r0,#20]
	adcs	r11,r11,r3
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr


@ void	GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	_GFp_nistz256_neg
.private_extern	_GFp_nistz256_neg
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_neg
#endif
.align	4
_GFp_nistz256_neg:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr	@ interoperable with Thumb ISA:-)
#endif


#ifdef __thumb2__
.thumb_func	__ecp_nistz256_neg
#endif
.align	4
__ecp_nistz256_neg:
	ldr	r4,[r1,#0]
	eor	r3,r3,r3
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	subs	r4,r3,r4
	ldr	r7,[r1,#12]
	sbcs	r5,r3,r5
	ldr	r8,[r1,#16]
	sbcs	r6,r3,r6
	ldr	r9,[r1,#20]
	sbcs	r7,r3,r7
	ldr	r10,[r1,#24]
	sbcs	r8,r3,r8
	ldr	r11,[r1,#28]
	sbcs	r9,r3,r9
	sbcs	r10,r3,r10
	sbcs	r11,r3,r11
	sbc	r3,r3,r3

	b	Lreduce_by_add

@ void	GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@				      const BN_ULONG r2[8]);
.globl	_GFp_nistz256_mul_mont
.private_extern	_GFp_nistz256_mul_mont
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_mul_mont
#endif
.align	4
_GFp_nistz256_mul_mont:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr	@ interoperable with Thumb ISA:-)
#endif


#ifdef __thumb2__
.thumb_func	__ecp_nistz256_mul_mont
#endif
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0,r1,r2,lr}	@ make a copy of arguments too

	ldr	r2,[r2,#0]	@ b[0]
	ldmia	r1,{r4,r5,r6,r7,r8,r9,r10,r11}

	umull	r3,r14,r4,r2	@ r[0]=a[0]*b[0]
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy a[0-7] to stack, so
						@ that it can be addressed
						@ without spending register
						@ on address
	umull	r4,r0,r5,r2	@ r[1]=a[1]*b[0]
	umull	r5,r1,r6,r2
	adds	r4,r4,r14	@ accumulate high part of mult
	umull	r6,r12,r7,r2
	adcs	r5,r5,r0
	umull	r7,r14,r8,r2
	adcs	r6,r6,r1
	umull	r8,r0,r9,r2
	adcs	r7,r7,r12
	umull	r9,r1,r10,r2
	adcs	r8,r8,r14
	umull	r10,r12,r11,r2
	adcs	r9,r9,r0
	adcs	r10,r10,r1
	eor	r14,r14,r14	@ first overflow bit is zero
	adc	r11,r12,#0
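
	@ The "multiplication-less reduction" steps below exploit the fact
	@ that -p^-1 mod 2^32 = 1 for p = 2^256-2^224+2^192+2^96-1 (p is
	@ 0xffffffff mod 2^32), so each Montgomery step simply adds r[0]*p:
	@ add r[0] into words 3, 6 and 8, subtract it from word 7, and let
	@ the -r[0] term cancel the low word, which then drops out as the
	@ accumulator window shifts down by one word.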
	@ multiplication-less reduction 1
	adds	r6,r6,r3	@ r[3]+=r[0]
	ldr	r2,[sp,#40]	@ restore b_ptr
	adcs	r7,r7,#0	@ r[4]+=0
	adcs	r8,r8,#0	@ r[5]+=0
	adcs	r9,r9,r3	@ r[6]+=r[0]
	ldr	r1,[sp,#0]	@ load a[0]
	adcs	r10,r10,#0	@ r[7]+=0
	ldr	r2,[r2,#4*1]	@ load b[i]
	adcs	r11,r11,r3	@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0	@ overflow bit
	subs	r10,r10,r3	@ r[7]-=r[0]
	ldr	r12,[sp,#4]	@ a[1]
	sbcs	r11,r11,#0	@ r[8]-=0
	umlal	r4,r0,r1,r2	@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r3,r14,#0	@ overflow bit, keep in mind
				@ that net result is
				@ addition of a value which
				@ makes underflow impossible

	ldr	r14,[sp,#8]	@ a[2]
	umlal	r5,r1,r12,r2	@ "r[1]"+=a[1]*b[i]
	str	r3,[sp,#36]	@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r3,[sp,#12]	@ a[3], r3 is alias r3
	umlal	r6,r12,r14,r2	@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r5,r5,r0	@ accumulate high part of mult
	ldr	r0,[sp,#16]	@ a[4]
	umlal	r7,r14,r3,r2	@ "r[3]"+=a[3]*b[i]
	eor	r3,r3,r3
	adcs	r6,r6,r1
	ldr	r1,[sp,#20]	@ a[5]
	umlal	r8,r3,r0,r2	@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r7,r7,r12
	ldr	r12,[sp,#24]	@ a[6]
	umlal	r9,r0,r1,r2	@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r8,r8,r14
	ldr	r14,[sp,#28]	@ a[7]
	umlal	r10,r1,r12,r2	@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r9,r9,r3
	ldr	r3,[sp,#36]	@ restore overflow bit
	umlal	r11,r12,r14,r2	@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r10,r10,r0
	adcs	r11,r11,r1
	adcs	r3,r3,r12
	adc	r14,r14,#0	@ new overflow bit
	@ multiplication-less reduction 2
	adds	r7,r7,r4	@ r[3]+=r[0]
	ldr	r2,[sp,#40]	@ restore b_ptr
	adcs	r8,r8,#0	@ r[4]+=0
	adcs	r9,r9,#0	@ r[5]+=0
	adcs	r10,r10,r4	@ r[6]+=r[0]
	ldr	r1,[sp,#0]	@ load a[0]
	adcs	r11,r11,#0	@ r[7]+=0
	ldr	r2,[r2,#4*2]	@ load b[i]
	adcs	r3,r3,r4	@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0	@ overflow bit
	subs	r11,r11,r4	@ r[7]-=r[0]
	ldr	r12,[sp,#4]	@ a[1]
	sbcs	r3,r3,#0	@ r[8]-=0
	umlal	r5,r0,r1,r2	@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r4,r14,#0	@ overflow bit, keep in mind
				@ that net result is
				@ addition of a value which
				@ makes underflow impossible

	ldr	r14,[sp,#8]	@ a[2]
	umlal	r6,r1,r12,r2	@ "r[1]"+=a[1]*b[i]
	str	r4,[sp,#36]	@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r4,[sp,#12]	@ a[3], r4 is alias r4
	umlal	r7,r12,r14,r2	@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r6,r6,r0	@ accumulate high part of mult
	ldr	r0,[sp,#16]	@ a[4]
	umlal	r8,r14,r4,r2	@ "r[3]"+=a[3]*b[i]
	eor	r4,r4,r4
	adcs	r7,r7,r1
	ldr	r1,[sp,#20]	@ a[5]
	umlal	r9,r4,r0,r2	@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r8,r8,r12
	ldr	r12,[sp,#24]	@ a[6]
	umlal	r10,r0,r1,r2	@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r9,r9,r14
	ldr	r14,[sp,#28]	@ a[7]
	umlal	r11,r1,r12,r2	@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r10,r10,r4
	ldr	r4,[sp,#36]	@ restore overflow bit
	umlal	r3,r12,r14,r2	@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r11,r11,r0
	adcs	r3,r3,r1
	adcs	r4,r4,r12
	adc	r14,r14,#0	@ new overflow bit
	@ multiplication-less reduction 3
	adds	r8,r8,r5	@ r[3]+=r[0]
	ldr	r2,[sp,#40]	@ restore b_ptr
	adcs	r9,r9,#0	@ r[4]+=0
	adcs	r10,r10,#0	@ r[5]+=0
	adcs	r11,r11,r5	@ r[6]+=r[0]
	ldr	r1,[sp,#0]	@ load a[0]
	adcs	r3,r3,#0	@ r[7]+=0
	ldr	r2,[r2,#4*3]	@ load b[i]
	adcs	r4,r4,r5	@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0	@ overflow bit
	subs	r3,r3,r5	@ r[7]-=r[0]
	ldr	r12,[sp,#4]	@ a[1]
	sbcs	r4,r4,#0	@ r[8]-=0
	umlal	r6,r0,r1,r2	@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r5,r14,#0	@ overflow bit, keep in mind
				@ that net result is
				@ addition of a value which
				@ makes underflow impossible

	ldr	r14,[sp,#8]	@ a[2]
	umlal	r7,r1,r12,r2	@ "r[1]"+=a[1]*b[i]
	str	r5,[sp,#36]	@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r5,[sp,#12]	@ a[3], r5 is alias r5
	umlal	r8,r12,r14,r2	@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r7,r7,r0	@ accumulate high part of mult
	ldr	r0,[sp,#16]	@ a[4]
	umlal	r9,r14,r5,r2	@ "r[3]"+=a[3]*b[i]
	eor	r5,r5,r5
	adcs	r8,r8,r1
	ldr	r1,[sp,#20]	@ a[5]
	umlal	r10,r5,r0,r2	@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r9,r9,r12
	ldr	r12,[sp,#24]	@ a[6]
	umlal	r11,r0,r1,r2	@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r10,r10,r14
	ldr	r14,[sp,#28]	@ a[7]
	umlal	r3,r1,r12,r2	@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r11,r11,r5
	ldr	r5,[sp,#36]	@ restore overflow bit
	umlal	r4,r12,r14,r2	@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r3,r3,r0
	adcs	r4,r4,r1
	adcs	r5,r5,r12
	adc	r14,r14,#0	@ new overflow bit
	@ multiplication-less reduction 4
	adds	r9,r9,r6	@ r[3]+=r[0]
	ldr	r2,[sp,#40]	@ restore b_ptr
	adcs	r10,r10,#0	@ r[4]+=0
	adcs	r11,r11,#0	@ r[5]+=0
	adcs	r3,r3,r6	@ r[6]+=r[0]
	ldr	r1,[sp,#0]	@ load a[0]
	adcs	r4,r4,#0	@ r[7]+=0
	ldr	r2,[r2,#4*4]	@ load b[i]
	adcs	r5,r5,r6	@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0	@ overflow bit
	subs	r4,r4,r6	@ r[7]-=r[0]
	ldr	r12,[sp,#4]	@ a[1]
	sbcs	r5,r5,#0	@ r[8]-=0
	umlal	r7,r0,r1,r2	@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r6,r14,#0	@ overflow bit, keep in mind
				@ that net result is
				@ addition of a value which
				@ makes underflow impossible

	ldr	r14,[sp,#8]	@ a[2]
	umlal	r8,r1,r12,r2	@ "r[1]"+=a[1]*b[i]
	str	r6,[sp,#36]	@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r6,[sp,#12]	@ a[3], r6 is alias r6
	umlal	r9,r12,r14,r2	@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r8,r8,r0	@ accumulate high part of mult
	ldr	r0,[sp,#16]	@ a[4]
	umlal	r10,r14,r6,r2	@ "r[3]"+=a[3]*b[i]
	eor	r6,r6,r6
	adcs	r9,r9,r1
	ldr	r1,[sp,#20]	@ a[5]
	umlal	r11,r6,r0,r2	@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r10,r10,r12
	ldr	r12,[sp,#24]	@ a[6]
	umlal	r3,r0,r1,r2	@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r11,r11,r14
	ldr	r14,[sp,#28]	@ a[7]
	umlal	r4,r1,r12,r2	@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r3,r3,r6
	ldr	r6,[sp,#36]	@ restore overflow bit
	umlal	r5,r12,r14,r2	@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r4,r4,r0
	adcs	r5,r5,r1
	adcs	r6,r6,r12
	adc	r14,r14,#0	@ new overflow bit
	@ multiplication-less reduction 5
	adds	r10,r10,r7	@ r[3]+=r[0]
	ldr	r2,[sp,#40]	@ restore b_ptr
	adcs	r11,r11,#0	@ r[4]+=0
	adcs	r3,r3,#0	@ r[5]+=0
	adcs	r4,r4,r7	@ r[6]+=r[0]
	ldr	r1,[sp,#0]	@ load a[0]
	adcs	r5,r5,#0	@ r[7]+=0
	ldr	r2,[r2,#4*5]	@ load b[i]
	adcs	r6,r6,r7	@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0	@ overflow bit
	subs	r5,r5,r7	@ r[7]-=r[0]
	ldr	r12,[sp,#4]	@ a[1]
	sbcs	r6,r6,#0	@ r[8]-=0
	umlal	r8,r0,r1,r2	@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r7,r14,#0	@ overflow bit, keep in mind
				@ that net result is
				@ addition of a value which
				@ makes underflow impossible

	ldr	r14,[sp,#8]	@ a[2]
	umlal	r9,r1,r12,r2	@ "r[1]"+=a[1]*b[i]
	str	r7,[sp,#36]	@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r7,[sp,#12]	@ a[3], r7 is alias r7
	umlal	r10,r12,r14,r2	@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r9,r9,r0	@ accumulate high part of mult
	ldr	r0,[sp,#16]	@ a[4]
	umlal	r11,r14,r7,r2	@ "r[3]"+=a[3]*b[i]
	eor	r7,r7,r7
	adcs	r10,r10,r1
	ldr	r1,[sp,#20]	@ a[5]
	umlal	r3,r7,r0,r2	@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r11,r11,r12
	ldr	r12,[sp,#24]	@ a[6]
	umlal	r4,r0,r1,r2	@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r3,r3,r14
	ldr	r14,[sp,#28]	@ a[7]
	umlal	r5,r1,r12,r2	@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r4,r4,r7
	ldr	r7,[sp,#36]	@ restore overflow bit
	umlal	r6,r12,r14,r2	@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r5,r5,r0
	adcs	r6,r6,r1
	adcs	r7,r7,r12
	adc	r14,r14,#0	@ new overflow bit
	@ multiplication-less reduction 6
	adds	r11,r11,r8	@ r[3]+=r[0]
	ldr	r2,[sp,#40]	@ restore b_ptr
	adcs	r3,r3,#0	@ r[4]+=0
	adcs	r4,r4,#0	@ r[5]+=0
	adcs	r5,r5,r8	@ r[6]+=r[0]
	ldr	r1,[sp,#0]	@ load a[0]
	adcs	r6,r6,#0	@ r[7]+=0
	ldr	r2,[r2,#4*6]	@ load b[i]
	adcs	r7,r7,r8	@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0	@ overflow bit
	subs	r6,r6,r8	@ r[7]-=r[0]
	ldr	r12,[sp,#4]	@ a[1]
	sbcs	r7,r7,#0	@ r[8]-=0
	umlal	r9,r0,r1,r2	@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r8,r14,#0	@ overflow bit, keep in mind
				@ that net result is
				@ addition of a value which
				@ makes underflow impossible

	ldr	r14,[sp,#8]	@ a[2]
	umlal	r10,r1,r12,r2	@ "r[1]"+=a[1]*b[i]
	str	r8,[sp,#36]	@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r8,[sp,#12]	@ a[3], r8 is alias r8
	umlal	r11,r12,r14,r2	@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r10,r10,r0	@ accumulate high part of mult
	ldr	r0,[sp,#16]	@ a[4]
	umlal	r3,r14,r8,r2	@ "r[3]"+=a[3]*b[i]
	eor	r8,r8,r8
	adcs	r11,r11,r1
	ldr	r1,[sp,#20]	@ a[5]
	umlal	r4,r8,r0,r2	@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r3,r3,r12
	ldr	r12,[sp,#24]	@ a[6]
	umlal	r5,r0,r1,r2	@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r4,r4,r14
	ldr	r14,[sp,#28]	@ a[7]
	umlal	r6,r1,r12,r2	@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r5,r5,r8
	ldr	r8,[sp,#36]	@ restore overflow bit
	umlal	r7,r12,r14,r2	@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r6,r6,r0
	adcs	r7,r7,r1
	adcs	r8,r8,r12
	adc	r14,r14,#0	@ new overflow bit
	@ multiplication-less reduction 7
	adds	r3,r3,r9	@ r[3]+=r[0]
	ldr	r2,[sp,#40]	@ restore b_ptr
	adcs	r4,r4,#0	@ r[4]+=0
	adcs	r5,r5,#0	@ r[5]+=0
	adcs	r6,r6,r9	@ r[6]+=r[0]
	ldr	r1,[sp,#0]	@ load a[0]
	adcs	r7,r7,#0	@ r[7]+=0
	ldr	r2,[r2,#4*7]	@ load b[i]
	adcs	r8,r8,r9	@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0	@ overflow bit
	subs	r7,r7,r9	@ r[7]-=r[0]
	ldr	r12,[sp,#4]	@ a[1]
	sbcs	r8,r8,#0	@ r[8]-=0
	umlal	r10,r0,r1,r2	@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r9,r14,#0	@ overflow bit, keep in mind
				@ that net result is
				@ addition of a value which
				@ makes underflow impossible

	ldr	r14,[sp,#8]	@ a[2]
	umlal	r11,r1,r12,r2	@ "r[1]"+=a[1]*b[i]
	str	r9,[sp,#36]	@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r9,[sp,#12]	@ a[3], r9 is alias r9
	umlal	r3,r12,r14,r2	@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r11,r11,r0	@ accumulate high part of mult
	ldr	r0,[sp,#16]	@ a[4]
	umlal	r4,r14,r9,r2	@ "r[3]"+=a[3]*b[i]
	eor	r9,r9,r9
	adcs	r3,r3,r1
	ldr	r1,[sp,#20]	@ a[5]
	umlal	r5,r9,r0,r2	@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r4,r4,r12
	ldr	r12,[sp,#24]	@ a[6]
	umlal	r6,r0,r1,r2	@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r5,r5,r14
	ldr	r14,[sp,#28]	@ a[7]
	umlal	r7,r1,r12,r2	@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r6,r6,r9
	ldr	r9,[sp,#36]	@ restore overflow bit
	umlal	r8,r12,r14,r2	@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r7,r7,r0
	adcs	r8,r8,r1
	adcs	r9,r9,r12
	adc	r14,r14,#0	@ new overflow bit
	@ last multiplication-less reduction
	adds	r4,r4,r10
	ldr	r0,[sp,#32]	@ restore r_ptr
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,r10
	adcs	r8,r8,#0
	adcs	r9,r9,r10
	adc	r14,r14,#0
	subs	r8,r8,r10
	sbcs	r9,r9,#0
	sbc	r10,r14,#0	@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	r11,r11,#1	@ subs	r11,r11,#-1
	adcs	r3,r3,#0	@ sbcs	r3,r3,#-1
	adcs	r4,r4,#0	@ sbcs	r4,r4,#-1
	sbcs	r5,r5,#0
	sbcs	r6,r6,#0
	sbcs	r7,r7,#0
	sbcs	r8,r8,#1
	adcs	r9,r9,#0	@ sbcs	r9,r9,#-1
	ldr	lr,[sp,#44]	@ restore lr
	sbc	r10,r10,#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, r10, and using it as
	@ a whole or extracting single bit.

	adds	r11,r11,r10	@ add modulus or zero
	adcs	r3,r3,r10
	str	r11,[r0,#0]
	adcs	r4,r4,r10
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r10,lsr#31
	str	r7,[r0,#20]
	adc	r9,r9,r10
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr

#ifdef __thumb2__
.thumb_func	__ecp_nistz256_sub_from
#endif
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!	@ push lr

	ldr	r10,[r2,#0]
	ldr	r12,[r2,#4]
	ldr	r14,[r2,#8]
	ldr	r1,[r2,#12]
	subs	r11,r11,r10
	ldr	r10,[r2,#16]
	sbcs	r3,r3,r12
	ldr	r12,[r2,#20]
	sbcs	r4,r4,r14
	ldr	r14,[r2,#24]
	sbcs	r5,r5,r1
	ldr	r1,[r2,#28]
	sbcs	r6,r6,r10
	sbcs	r7,r7,r12
	sbcs	r8,r8,r14
	sbcs	r9,r9,r1
	sbc	r2,r2,r2	@ broadcast borrow bit
	ldr	lr,[sp],#4	@ pop lr

	adds	r11,r11,r2	@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr

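@ Unlike __ecp_nistz256_sub_from above, which computes the register-held
@ value minus the memory operand at [r2], the "morf" variant below computes
@ the memory operand minus the register-held value. Both keep their operand
@ in r11,r3-r9 (the registers __ecp_nistz256_mul_mont leaves its result in)
@ and store the reduced result to [r0].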
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_sub_morf
#endif
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!	@ push lr

	ldr	r10,[r2,#0]
	ldr	r12,[r2,#4]
	ldr	r14,[r2,#8]
	ldr	r1,[r2,#12]
	subs	r11,r10,r11
	ldr	r10,[r2,#16]
	sbcs	r3,r12,r3
	ldr	r12,[r2,#20]
	sbcs	r4,r14,r4
	ldr	r14,[r2,#24]
	sbcs	r5,r1,r5
	ldr	r1,[r2,#28]
	sbcs	r6,r10,r6
	sbcs	r7,r12,r7
	sbcs	r8,r14,r8
	sbcs	r9,r1,r9
	sbc	r2,r2,r2	@ broadcast borrow bit
	ldr	lr,[sp],#4	@ pop lr

	adds	r11,r11,r2	@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr


#ifdef __thumb2__
.thumb_func	__ecp_nistz256_add_self
#endif
.align	4
__ecp_nistz256_add_self:
	adds	r11,r11,r11	@ a[0:7]+=a[0:7]
	adcs	r3,r3,r3
	adcs	r4,r4,r4
	adcs	r5,r5,r5
	adcs	r6,r6,r6
	adcs	r7,r7,r7
	adcs	r8,r8,r8
	mov	r2,#0
	adcs	r9,r9,r9
	adc	r2,r2,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	r11,r11,#-1
	sbcs	r3,r3,#-1
	sbcs	r4,r4,#-1
	sbcs	r5,r5,#0
	sbcs	r6,r6,#0
	sbcs	r7,r7,#0
	sbcs	r8,r8,#1
	sbcs	r9,r9,#-1
	sbc	r2,r2,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow r2 register...

	adds	r11,r11,r2	@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr


.globl	_GFp_nistz256_point_double
.private_extern	_GFp_nistz256_point_double
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_point_double
#endif
.align	5
_GFp_nistz256_point_double:
	stmdb	sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}	@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

Lpoint_double_shortcut:
	add	r3,sp,#96
	ldmia	r1!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy in_x
	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}

	add	r0,sp,#0
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	r2,r1,#32
	add	r1,r1,#32
	add	r0,sp,#64
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	r1,sp,#0
	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	r2,[sp,#32*5+4]
	add	r1,r2,#32
	add	r2,r2,#64
	add	r0,sp,#128
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	r0,[sp,#32*5]
	add	r0,r0,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	r1,sp,#96
	add	r2,sp,#64
	add	r0,sp,#32
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	r1,sp,#96
	add	r2,sp,#64
	add	r0,sp,#64
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	r1,sp,#0
	add	r2,sp,#0
	add	r0,sp,#128
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	r1,sp,#64
	add	r2,sp,#32
	add	r0,sp,#32
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	r0,[sp,#32*5]
	add	r1,sp,#128
	add	r0,r0,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	r1,sp,#32
	add	r0,sp,#32
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	r1,sp,#96
	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	r0,sp,#128
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	r0,[sp,#32*5]
	add	r1,sp,#32
	add	r2,sp,#32
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	r2,sp,#128
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	r1,sp,#32
	add	r2,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	r0,[sp,#32*5]
	add	r2,r0,#32
	add	r0,r0,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16	@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr	@ interoperable with Thumb ISA:-)
#endif

#endif // !OPENSSL_NO_ASM