; ---------------------------------------------------------------------------
; Syntax: NASM (Intel operand order), 32-bit x86.
; Symbols carry a leading underscore and arguments are read from the stack.
; ---------------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__,obj
section code    use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01010000h
%error yasm version 1.1.0 or later needed.
%endif
; Yasm automatically defines $@feat.00 and complains about redefining it.
; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
%else
$@feat.00 equ 1
%endif
section .text   code align=64
%else
section .text   code
%endif
;extern _OPENSSL_ia32cap_P
global  _bn_mul_mont

; ---------------------------------------------------------------------------
; _bn_mul_mont -- word-serial Montgomery multiplication (see the
; identification string emitted after the routine).
;
; Stack arguments as seen after the four pushes of the prologue.  The short
; names are used only in the comments of this file:
;   [20+esp]  rp   result words; written only by the common tail
;   [24+esp]  ap   first operand words
;   [28+esp]  bp   second operand words
;   [32+esp]  np   modulus words
;   [36+esp]  n0   pointer to one dword; the dword itself is copied into the
;                  frame and multiplied with tp[0] to form each reduction
;                  multiplier
;   [40+esp]  num  number of 32-bit words (compared as a signed value)
;
; Returns: eax = 0 when num < 4 (nothing is read or written in that case),
;          eax = 1 otherwise.
; Preserves ebp, ebx, esi, edi (pushed/popped) and restores esp from the
; value saved in the frame.  The SSE2 path uses mm0-mm7 and executes emms
; before reaching the tail.
;
; Private frame, relative to esp once L$002page_walk_done is reached:
;   [ 0]  squaring path only: num-1
;   [ 4]  rp
;   [ 8]  ap
;   [12]  bp; the integer paths reuse it (multiply path: pointer to the
;         current bp word, squaring path: current word index)
;   [16]  np
;   [20]  dword loaded through n0
;   [24]  esp as it was before the frame was carved out
;   [28]  integer multiply path only: bp + 4*num (end marker)
;   [32]  tp[0 .. num+1], the running accumulator
; ---------------------------------------------------------------------------
align   16
_bn_mul_mont:
L$_bn_mul_mont_begin:
        push    ebp
        push    ebx
        push    esi
        push    edi
        xor     eax,eax                         ; return value for the early exit
        mov     edi,DWORD [40+esp]              ; edi = num
        cmp     edi,4
        jl      NEAR L$000just_leave            ; num < 4: return 0

        ; ---- choose the frame address (ebp) below the current stack ----
        lea     esi,[20+esp]                    ; esi = address of the argument list
        lea     edx,[24+esp]
        add     edi,2
        neg     edi
        lea     ebp,[edi*4+esp-32]              ; esp - 32 - 4*(num+2)
        neg     edi                             ; edi = num+2 again
        ; Shift the frame down relative to the argument area (edx) using its
        ; offset modulo 2048 and bit 11 of the address difference.
        ; NOTE(review): presumably a cache/aliasing placement heuristic --
        ; the reason is not visible in this file.
        mov     eax,ebp
        sub     eax,edx
        and     eax,2047
        sub     ebp,eax
        xor     edx,ebp
        and     edx,2048
        xor     edx,2048
        sub     ebp,edx
        and     ebp,-64                         ; round down to a 64-byte boundary

        ; ---- lower esp to ebp, reading one dword per 4096-byte step ----
        mov     eax,esp
        sub     eax,ebp
        and     eax,-4096
        mov     edx,esp                         ; edx = esp to restore on exit
        lea     esp,[eax*1+ebp]
        mov     eax,DWORD [esp]
        cmp     esp,ebp
        ja      NEAR L$001page_walk
        jmp     NEAR L$002page_walk_done
align   16
L$001page_walk:
        lea     esp,[esp-4096]
        mov     eax,DWORD [esp]
        cmp     esp,ebp
        ja      NEAR L$001page_walk
L$002page_walk_done:
        ; ---- copy the arguments into the frame ----
        mov     eax,DWORD [esi]                 ; rp
        mov     ebx,DWORD [4+esi]               ; ap
        mov     ecx,DWORD [8+esi]               ; bp
        mov     ebp,DWORD [12+esi]              ; np
        mov     esi,DWORD [16+esi]              ; n0 (pointer)
        mov     esi,DWORD [esi]                 ; dword behind n0
        mov     DWORD [4+esp],eax
        mov     DWORD [8+esp],ebx
        mov     DWORD [12+esp],ecx
        mov     DWORD [16+esp],ebp
        mov     DWORD [20+esp],esi
        lea     ebx,[edi-3]                     ; ebx = (num+2)-3 = num-1
        mov     DWORD [24+esp],edx              ; saved esp

        ; ---- dispatch on bit 26 of the first capability dword ----
        lea     eax,[_OPENSSL_ia32cap_P]
        bt      DWORD [eax],26
        jnc     NEAR L$003non_sse2

        ; ================= MMX/SSE2 path (pmuludq) =================
        ; mm7 = 0x00000000FFFFFFFF mask, mm4 = current bp word,
        ; mm5 = reduction multiplier, mm2 / mm3 = the two 64-bit carry chains.
        mov     eax,-1
        movd    mm7,eax
        mov     esi,DWORD [8+esp]               ; ap
        mov     edi,DWORD [12+esp]              ; bp
        mov     ebp,DWORD [16+esp]              ; np
        xor     edx,edx                         ; outer index i = 0
        xor     ecx,ecx                         ; inner index j = 0
        movd    mm4,DWORD [edi]                 ; bp[0]
        movd    mm5,DWORD [esi]                 ; ap[0]
        movd    mm3,DWORD [ebp]                 ; np[0]
        pmuludq mm5,mm4                         ; ap[0]*bp[0]
        movq    mm2,mm5
        movq    mm0,mm5
        pand    mm0,mm7                         ; low 32 bits of the product
        pmuludq mm5,[20+esp]                    ; multiplier = low dword * frame[20]
        pmuludq mm3,mm5                         ; np[0]*multiplier
        paddq   mm3,mm0
        movd    mm1,DWORD [4+ebp]               ; np[1]
        movd    mm0,DWORD [4+esi]               ; ap[1]
        psrlq   mm2,32
        psrlq   mm3,32
        inc     ecx
align   16
L$0041st:
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        pand    mm0,mm7
        movd    mm1,DWORD [4+ecx*4+ebp]
        paddq   mm3,mm0
        movd    mm0,DWORD [4+ecx*4+esi]
        psrlq   mm2,32
        movd    DWORD [28+ecx*4+esp],mm3        ; tp[j-1]
        psrlq   mm3,32
        lea     ecx,[1+ecx]
        cmp     ecx,ebx
        jl      NEAR L$0041st
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        pand    mm0,mm7
        paddq   mm3,mm0
        movd    DWORD [28+ecx*4+esp],mm3
        psrlq   mm2,32
        psrlq   mm3,32
        paddq   mm3,mm2
        movq    [32+ebx*4+esp],mm3              ; tp[num-1] and tp[num]
        inc     edx                             ; i = 1
L$005outer:
        xor     ecx,ecx
        movd    mm4,DWORD [edx*4+edi]           ; bp[i]
        movd    mm5,DWORD [esi]                 ; ap[0]
        movd    mm6,DWORD [32+esp]              ; tp[0]
        movd    mm3,DWORD [ebp]                 ; np[0]
        pmuludq mm5,mm4
        paddq   mm5,mm6
        movq    mm0,mm5
        movq    mm2,mm5
        pand    mm0,mm7
        pmuludq mm5,[20+esp]
        pmuludq mm3,mm5
        paddq   mm3,mm0
        movd    mm6,DWORD [36+esp]              ; tp[1]
        movd    mm1,DWORD [4+ebp]
        movd    mm0,DWORD [4+esi]
        psrlq   mm2,32
        psrlq   mm3,32
        paddq   mm2,mm6
        inc     ecx
        dec     ebx
L$006inner:
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        movd    mm6,DWORD [36+ecx*4+esp]
        pand    mm0,mm7
        movd    mm1,DWORD [4+ecx*4+ebp]
        paddq   mm3,mm0
        movd    mm0,DWORD [4+ecx*4+esi]
        psrlq   mm2,32
        movd    DWORD [28+ecx*4+esp],mm3
        psrlq   mm3,32
        paddq   mm2,mm6
        dec     ebx                             ; sets ZF for the jnz below
        lea     ecx,[1+ecx]                     ; lea leaves the flags alone
        jnz     NEAR L$006inner
        mov     ebx,ecx                         ; ebx = num-1 again
        pmuludq mm0,mm4
        pmuludq mm1,mm5
        paddq   mm2,mm0
        paddq   mm3,mm1
        movq    mm0,mm2
        pand    mm0,mm7
        paddq   mm3,mm0
        movd    DWORD [28+ecx*4+esp],mm3
        psrlq   mm2,32
        psrlq   mm3,32
        movd    mm6,DWORD [36+ebx*4+esp]
        paddq   mm3,mm2
        paddq   mm3,mm6
        movq    [32+ebx*4+esp],mm3
        lea     edx,[1+edx]                     ; i++
        cmp     edx,ebx
        jle     NEAR L$005outer
        emms
        jmp     NEAR L$007common_tail

        ; ================= integer path =================
align   16
L$003non_sse2:
        mov     esi,DWORD [8+esp]               ; ap
        lea     ebp,[1+ebx]                     ; num
        mov     edi,DWORD [12+esp]              ; bp
        xor     ecx,ecx
        mov     edx,esi
        and     ebp,1                           ; num & 1
        sub     edx,edi                         ; ap - bp
        lea     eax,[4+ebx*4+edi]               ; bp + 4*num
        or      ebp,edx                         ; zero iff num is even and ap == bp
        mov     edi,DWORD [edi]                 ; bp[0]; mov keeps ZF from the or
        jz      NEAR L$008bn_sqr_mont
        mov     DWORD [28+esp],eax              ; end marker for the bp walk
        mov     eax,DWORD [esi]                 ; ap[0]
        xor     edx,edx
align   16
L$009mull:                                      ; tp = ap * bp[0]
        mov     ebp,edx
        mul     edi
        add     ebp,eax
        lea     ecx,[1+ecx]
        adc     edx,0
        mov     eax,DWORD [ecx*4+esi]
        cmp     ecx,ebx
        mov     DWORD [28+ecx*4+esp],ebp
        jl      NEAR L$009mull
        mov     ebp,edx
        mul     edi
        mov     edi,DWORD [20+esp]
        add     eax,ebp
        mov     esi,DWORD [16+esp]              ; esi = np from here on
        adc     edx,0
        imul    edi,DWORD [32+esp]              ; multiplier = frame[20] * tp[0]
        mov     DWORD [32+ebx*4+esp],eax
        xor     ecx,ecx
        mov     DWORD [36+ebx*4+esp],edx
        mov     DWORD [40+ebx*4+esp],ecx
        mov     eax,DWORD [esi]
        mul     edi
        add     eax,DWORD [32+esp]              ; only the carry out is kept
        mov     eax,DWORD [4+esi]
        adc     edx,0
        inc     ecx
        jmp     NEAR L$0102ndmadd
align   16
L$0111stmadd:                                   ; tp += ap * bp[i]
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ecx*4+esp]
        lea     ecx,[1+ecx]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [ecx*4+esi]
        adc     edx,0
        cmp     ecx,ebx
        mov     DWORD [28+ecx*4+esp],ebp
        jl      NEAR L$0111stmadd
        mov     ebp,edx
        mul     edi
        add     eax,DWORD [32+ebx*4+esp]
        mov     edi,DWORD [20+esp]
        adc     edx,0
        mov     esi,DWORD [16+esp]
        add     ebp,eax
        adc     edx,0
        imul    edi,DWORD [32+esp]
        xor     ecx,ecx
        add     edx,DWORD [36+ebx*4+esp]
        mov     DWORD [32+ebx*4+esp],ebp
        adc     ecx,0
        mov     eax,DWORD [esi]
        mov     DWORD [36+ebx*4+esp],edx
        mov     DWORD [40+ebx*4+esp],ecx
        mul     edi
        add     eax,DWORD [32+esp]
        mov     eax,DWORD [4+esi]
        adc     edx,0
        mov     ecx,1
align   16
L$0102ndmadd:                                   ; tp = (tp + np * multiplier) >> 32
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ecx*4+esp]
        lea     ecx,[1+ecx]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [ecx*4+esi]
        adc     edx,0
        cmp     ecx,ebx
        mov     DWORD [24+ecx*4+esp],ebp        ; stored one word lower (the shift)
        jl      NEAR L$0102ndmadd
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ebx*4+esp]
        adc     edx,0
        add     ebp,eax
        adc     edx,0
        mov     DWORD [28+ebx*4+esp],ebp
        xor     eax,eax
        mov     ecx,DWORD [12+esp]              ; pointer to the current bp word
        add     edx,DWORD [36+ebx*4+esp]
        adc     eax,DWORD [40+ebx*4+esp]
        lea     ecx,[4+ecx]                     ; advance to the next bp word
        mov     DWORD [32+ebx*4+esp],edx
        cmp     ecx,DWORD [28+esp]              ; reached bp + 4*num?
        mov     DWORD [36+ebx*4+esp],eax
        je      NEAR L$007common_tail
        mov     edi,DWORD [ecx]                 ; next bp word
        mov     esi,DWORD [8+esp]               ; back to ap
        mov     DWORD [12+esp],ecx
        xor     ecx,ecx
        xor     edx,edx
        mov     eax,DWORD [esi]
        jmp     NEAR L$0111stmadd

        ; ---- integer squaring path (taken when ap == bp and num is even) ----
align   16
L$008bn_sqr_mont:
        mov     DWORD [esp],ebx                 ; frame[0] = num-1
        mov     DWORD [12+esp],ecx              ; frame[12] = word index, 0 here
        mov     eax,edi
        mul     edi                             ; ap[0]^2
        mov     DWORD [32+esp],eax
        mov     ebx,edx
        shr     edx,1
        and     ebx,1
        inc     ecx
align   16
L$012sqr:
        mov     eax,DWORD [ecx*4+esi]
        mov     ebp,edx
        mul     edi
        add     eax,ebp
        lea     ecx,[1+ecx]
        adc     edx,0
        lea     ebp,[eax*2+ebx]                 ; doubled cross product + carried bit
        shr     eax,31
        cmp     ecx,DWORD [esp]
        mov     ebx,eax
        mov     DWORD [28+ecx*4+esp],ebp
        jl      NEAR L$012sqr
        mov     eax,DWORD [ecx*4+esi]
        mov     ebp,edx
        mul     edi
        add     eax,ebp
        mov     edi,DWORD [20+esp]
        adc     edx,0
        mov     esi,DWORD [16+esp]              ; esi = np
        lea     ebp,[eax*2+ebx]
        imul    edi,DWORD [32+esp]              ; multiplier = frame[20] * tp[0]
        shr     eax,31
        mov     DWORD [32+ecx*4+esp],ebp
        lea     ebp,[edx*2+eax]
        mov     eax,DWORD [esi]
        shr     edx,31
        mov     DWORD [36+ecx*4+esp],ebp
        mov     DWORD [40+ecx*4+esp],edx
        mul     edi
        add     eax,DWORD [32+esp]
        mov     ebx,ecx
        adc     edx,0
        mov     eax,DWORD [4+esi]
        mov     ecx,1
align   16
L$0133rdmadd:                                   ; reduction step, two words per pass
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ecx*4+esp]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [4+ecx*4+esi]
        adc     edx,0
        mov     DWORD [28+ecx*4+esp],ebp
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [36+ecx*4+esp]
        lea     ecx,[2+ecx]
        adc     edx,0
        add     ebp,eax
        mov     eax,DWORD [ecx*4+esi]
        adc     edx,0
        cmp     ecx,ebx
        mov     DWORD [24+ecx*4+esp],ebp
        jl      NEAR L$0133rdmadd
        mov     ebp,edx
        mul     edi
        add     ebp,DWORD [32+ebx*4+esp]
        adc     edx,0
        add     ebp,eax
        adc     edx,0
        mov     DWORD [28+ebx*4+esp],ebp
        mov     ecx,DWORD [12+esp]              ; current word index
        xor     eax,eax
        mov     esi,DWORD [8+esp]               ; back to ap
        add     edx,DWORD [36+ebx*4+esp]
        adc     eax,DWORD [40+ebx*4+esp]
        mov     DWORD [32+ebx*4+esp],edx
        cmp     ecx,ebx                         ; index reached num-1?
        mov     DWORD [36+ebx*4+esp],eax
        je      NEAR L$007common_tail
        mov     edi,DWORD [4+ecx*4+esi]         ; next ap word
        lea     ecx,[1+ecx]
        mov     eax,edi
        mov     DWORD [12+esp],ecx
        mul     edi                             ; its square
        add     eax,DWORD [32+ecx*4+esp]
        adc     edx,0
        mov     DWORD [32+ecx*4+esp],eax
        xor     ebp,ebp
        cmp     ecx,ebx
        lea     ecx,[1+ecx]                     ; lea keeps the flags of the cmp
        je      NEAR L$014sqrlast
        mov     ebx,edx
        shr     edx,1
        and     ebx,1
align   16
L$015sqradd:
        mov     eax,DWORD [ecx*4+esi]
        mov     ebp,edx
        mul     edi
        add     eax,ebp
        lea     ebp,[eax*1+eax]
        adc     edx,0
        shr     eax,31
        add     ebp,DWORD [32+ecx*4+esp]
        lea     ecx,[1+ecx]
        adc     eax,0
        add     ebp,ebx
        adc     eax,0
        cmp     ecx,DWORD [esp]
        mov     DWORD [28+ecx*4+esp],ebp
        mov     ebx,eax
        jle     NEAR L$015sqradd
        mov     ebp,edx
        add     edx,edx
        shr     ebp,31
        add     edx,ebx
        adc     ebp,0
L$014sqrlast:
        mov     edi,DWORD [20+esp]
        mov     esi,DWORD [16+esp]              ; esi = np
        imul    edi,DWORD [32+esp]              ; multiplier = frame[20] * tp[0]
        add     edx,DWORD [32+ecx*4+esp]
        mov     eax,DWORD [esi]
        adc     ebp,0
        mov     DWORD [32+ecx*4+esp],edx
        mov     DWORD [36+ecx*4+esp],ebp
        mul     edi
        add     eax,DWORD [32+esp]
        lea     ebx,[ecx-1]                     ; ebx = num-1
        adc     edx,0
        mov     ecx,1
        mov     eax,DWORD [4+esi]
        jmp     NEAR L$0133rdmadd

        ; ================= common tail =================
        ; Entered with ebx = num-1 and the accumulator in tp[0..num].
        ; 1. rp = tp - np, borrow propagated through CF.
        ; 2. eax = tp[num] - borrow: all ones when tp itself must be kept,
        ;    zero when the difference already in rp is the answer.
        ; 3. rp[i] = (tp[i] & eax) | (rp[i] & ~eax) and tp[i] is overwritten.
        ; Both tp[i] and rp[i] are loaded on every iteration, so the addresses
        ; touched do not depend on the mask.
align   16
L$007common_tail:
        mov     ebp,DWORD [16+esp]              ; np
        mov     edi,DWORD [4+esp]               ; rp
        lea     esi,[32+esp]                    ; tp
        mov     eax,DWORD [esi]                 ; tp[0]
        mov     ecx,ebx                         ; countdown from num-1
        xor     edx,edx                         ; index 0, and CF = 0
align   16
L$016sub:
        sbb     eax,DWORD [edx*4+ebp]
        mov     DWORD [edx*4+edi],eax           ; rp[i] = tp[i] - np[i] - borrow
        dec     ecx                             ; dec leaves CF untouched
        mov     eax,DWORD [4+edx*4+esi]         ; tp[i+1]
        lea     edx,[1+edx]
        jge     NEAR L$016sub
        sbb     eax,0                           ; fold the final borrow into tp[num]
        mov     edx,-1
        xor     edx,eax                         ; edx = ~eax
        jmp     NEAR L$017copy
align   16
L$017copy:
        mov     esi,DWORD [32+ebx*4+esp]        ; tp[i]
        mov     ebp,DWORD [ebx*4+edi]           ; rp[i]
        mov     DWORD [32+ebx*4+esp],ecx        ; overwrite tp[i] (ecx is -1 here)
        and     esi,eax
        and     ebp,edx
        or      ebp,esi
        mov     DWORD [ebx*4+edi],ebp
        dec     ebx
        jge     NEAR L$017copy
        mov     esp,DWORD [24+esp]              ; back to the prologue's esp
        mov     eax,1
L$000just_leave:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>", NUL
db      77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
db      112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
db      54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
db      32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
db      111,114,103,62,0

; Capability words tested by the dispatch above (bit 26 of the first dword).
segment .bss
common  _OPENSSL_ia32cap_P 16