1#! /usr/bin/env perl 2# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Redistribution and use in source and binary forms, with or without 5# modification, are permitted provided that the following conditions 6# are met: 7# 8# 1. Redistributions of source code must retain the above copyright 9# notice, this list of conditions and the following disclaimer. 10# 11# 2. Redistributions in binary form must reproduce the above copyright 12# notice, this list of conditions and the following disclaimer in 13# the documentation and/or other materials provided with the 14# distribution. 15# 16# 3. All advertising materials mentioning features or use of this 17# software must display the following acknowledgment: 18# "This product includes software developed by the OpenSSL Project 19# for use in the OpenSSL Toolkit. (http://www.openssl.org/)" 20# 21# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to 22# endorse or promote products derived from this software without 23# prior written permission. For written permission, please contact 24# openssl-core@openssl.org. 25# 26# 5. Products derived from this software may not be called "OpenSSL" 27# nor may "OpenSSL" appear in their names without prior written 28# permission of the OpenSSL Project. 29# 30# 6. Redistributions of any form whatsoever must retain the following 31# acknowledgment: 32# "This product includes software developed by the OpenSSL Project 33# for use in the OpenSSL Toolkit (http://www.openssl.org/)" 34# 35# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY 36# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 37# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 38# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR 39# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 40# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 41# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 42# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 43# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 44# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 45# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 46# OF THE POSSIBILITY OF SUCH DAMAGE. 47# ==================================================================== 48# 49# This product includes cryptographic software written by Eric Young 50# (eay@cryptsoft.com). This product includes software written by Tim 51# Hudson (tjh@cryptsoft.com). 52 53 54# ==================================================================== 55# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 56# project. The module is, however, dual licensed under OpenSSL and 57# CRYPTOGAMS licenses depending on where you obtain it. For further 58# details see http://www.openssl.org/~appro/cryptogams/. 59# ==================================================================== 60# 61# ECP_NISTZ256 module for x86/SSE2. 62# 63# October 2014. 64# 65# Original ECP_NISTZ256 submission targeting x86_64 is detailed in 66# http://eprint.iacr.org/2013/816. In the process of adaptation 67# original .c module was made 32-bit savvy in order to make this 68# implementation possible. 69# 70# with/without -DECP_NISTZ256_ASM 71# Pentium +66-163% 72# PIII +72-172% 73# P4 +65-132% 74# Core2 +90-215% 75# Sandy Bridge +105-265% (contemporary i[57]-* are all close to this) 76# Atom +65-155% 77# Opteron +54-110% 78# Bulldozer +99-240% 79# VIA Nano +93-290% 80# 81# Ranges denote minimum and maximum improvement coefficients depending 82# on benchmark. Lower coefficients are for ECDSA sign, server-side 83# operation. Keep in mind that +200% means 3x improvement. 
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

$output=pop;
open STDOUT,">$output";

&asm_init($ARGV[0],"ecp_nistz256-x86.pl",$ARGV[$#ARGV] eq "386");

# SSE2 code path is emitted only when the build passes -DOPENSSL_IA32_SSE2.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("GFp_ia32cap_P") if ($sse2);


########################################################################
# Keep in mind that constants are stored least to most significant word
&static_label("ONE_mont");
&set_label("ONE_mont");
&data_word(1,0,0,-1,-1,-1,-2,0);


# _ecp_nistz256_div_by_2: halve a 256-bit field element modulo p.
# Input:  %esi -> 8x32-bit little-endian limbs; Output: %edi -> 8 limbs.
# Clobbers eax/ebx/ecx/edx/ebp/esi; result is written through %edi first,
# then shifted right in place.
&function_begin_B("_ecp_nistz256_div_by_2");
	# tmp = a is odd ? a+mod : a
	#
	# note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning least significant bit of input to one register,
	# %ebp, and its negative to another, %edx.

	&mov	("ebp",&DWP(0,"esi"));
	&xor	("edx","edx");
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("eax","ebp");
	&and	("ebp",1);			# ebp = a&1
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("edx","ebp");			# edx = 0 or 0xffffffff

	&add	("eax","edx");
	&adc	("ebx","edx");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","edx");
	&mov	(&DWP(4,"edi"),"ebx");
	&mov	(&DWP(8,"edi"),"ecx");

	&mov	("eax",&DWP(12,"esi"));
	&mov	("ebx",&DWP(16,"esi"));
	&adc	("eax",0);
	&mov	("ecx",&DWP(20,"esi"));
	&adc	("ebx",0);
	&mov	(&DWP(12,"edi"),"eax");
	&adc	("ecx",0);
	&mov	(&DWP(16,"edi"),"ebx");
	&mov	(&DWP(20,"edi"),"ecx");

	&mov	("eax",&DWP(24,"esi"));
	&mov	("ebx",&DWP(28,"esi"));
	&adc	("eax","ebp");			# limb 6 of p is 1 (conditionally)
	&adc	("ebx","edx");			# limb 7 of p is 0xffffffff (conditionally)
	&mov	(&DWP(24,"edi"),"eax");
	&sbb	("esi","esi");			# broadcast carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	# ret = tmp >> 1

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&mov	("edx",&DWP(12,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	# ...continue tmp >> 1: each limb takes its neighbour's low bit
	# as its own new top bit (shl 31 / or).
	&shl	("ebx",31);
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(0,"edi"),"eax");
	&or	("ebp","ecx");
	&mov	("eax",&DWP(16,"edi"));

	&shr	("ebx",1);
	&mov	("ecx","edx");
	&shl	("edx",31);
	&mov	(&DWP(4,"edi"),"ebp");
	&or	("ebx","edx");
	&mov	("ebp",&DWP(20,"edi"));

	&shr	("ecx",1);
	&mov	("edx","eax");
	&shl	("eax",31);
	&mov	(&DWP(8,"edi"),"ebx");
	&or	("ecx","eax");
	&mov	("ebx",&DWP(24,"edi"));

	&shr	("edx",1);
	&mov	("eax","ebp");
	&shl	("ebp",31);
	&mov	(&DWP(12,"edi"),"ecx");
	&or	("edx","ebp");
	&mov	("ecx",&DWP(28,"edi"));

	&shr	("eax",1);
	&mov	("ebp","ebx");
	&shl	("ebx",31);
	&mov	(&DWP(16,"edi"),"edx");
	&or	("eax","ebx");

	&shr	("ebp",1);
	&mov	("ebx","ecx");
	&shl	("ecx",31);
	&mov	(&DWP(20,"edi"),"eax");
	&or	("ebp","ecx");

	&shr	("ebx",1);
	&shl	("esi",31);			# esi holds broadcast carry from a+mod
	&mov	(&DWP(24,"edi"),"ebp");
	&or	("ebx","esi");			# handle top-most carry bit
	&mov	(&DWP(28,"edi"),"ebx");

	&ret	();
&function_end_B("_ecp_nistz256_div_by_2");

########################################################################
# void GFp_nistz256_add(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
# Public wrapper: loads the stack arguments into the register-based
# calling convention used by the internal _ecp_nistz256_* helpers.
&function_begin("GFp_nistz256_add");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");
&function_end("GFp_nistz256_add");

# _ecp_nistz256_add: edi[8] = esi[8] + ebp[8] reduced once mod p.
# Loads and stores are interleaved to keep the adc carry chain hot.
&function_begin_B("_ecp_nistz256_add");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&add	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&adc	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&adc	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&adc	("ebx",&DWP(20,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&mov	("esi",0);
	&adc	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(24,"edi"),"ecx");
	&adc	("esi",0);			# esi = carry out of the 256-bit add
	&mov	(&DWP(28,"edi"),"edx");

	# if a+b >= modulus, subtract modulus.
	#
	# But since comparison implies subtraction, we subtract modulus
	# to see if it borrows, and then subtract it for real if
	# subtraction didn't borrow.

	# Trial-subtract p = ffffffff 00000001 00000000 00000000
	#                    00000000 ffffffff ffffffff ffffffff
	# (least significant limb first); results are discarded, only
	# the final borrow (folded into esi) matters.
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebx",&DWP(4,"edi"));
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax",-1);
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx",-1);
	&mov	("eax",&DWP(16,"edi"));
	&sbb	("ecx",-1);
	&mov	("ebx",&DWP(20,"edi"));
	&sbb	("edx",0);
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);

	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# by using borrow.
	# esi is 0 if p should be subtracted (sum >= p), else 0xffffffff;
	# after not(): esi = 0xffffffff when subtracting, 0 otherwise.
	&not	("esi");
	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);			# ebp = 0 or 1 (limb 6 of p)
	&mov	("ecx",&DWP(8,"edi"));
	&sub	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&sbb	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&sbb	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&sbb	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_add");

# _ecp_nistz256_sub: edi[8] = esi[8] - ebp[8] mod p
# (adds p back when the raw subtraction borrows).
&function_begin_B("_ecp_nistz256_sub");
	&mov	("eax",&DWP(0,"esi"));
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&sub	("eax",&DWP(0,"ebp"));
	&mov	("edx",&DWP(12,"esi"));
	&sbb	("ebx",&DWP(4,"ebp"));
	&mov	(&DWP(0,"edi"),"eax");
	&sbb	("ecx",&DWP(8,"ebp"));
	&mov	(&DWP(4,"edi"),"ebx");
	&sbb	("edx",&DWP(12,"ebp"));
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&sbb	("eax",&DWP(16,"ebp"));
	&mov	("edx",&DWP(28,"esi"));
	&sbb	("ebx",&DWP(20,"ebp"));
	&sbb	("ecx",&DWP(24,"ebp"));
	&mov	(&DWP(16,"edi"),"eax");
	&sbb	("edx",&DWP(28,"ebp"));
	&mov	(&DWP(20,"edi"),"ebx");
	&sbb	("esi","esi");			# broadcast borrow bit
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	# if a-b borrows, add modulus.
	#
	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning borrow bit to one register, %ebp, and its negative
	# to another, %esi. But we started by calculating %esi...

	&mov	("eax",&DWP(0,"edi"));
	&mov	("ebp","esi");
	&mov	("ebx",&DWP(4,"edi"));
	&shr	("ebp",31);			# ebp = borrow bit (limb 6 of p)
	&mov	("ecx",&DWP(8,"edi"));
	&add	("eax","esi");
	&mov	("edx",&DWP(12,"edi"));
	&adc	("ebx","esi");
	&mov	(&DWP(0,"edi"),"eax");
	&adc	("ecx","esi");
	&mov	(&DWP(4,"edi"),"ebx");
	&adc	("edx",0);
	&mov	(&DWP(8,"edi"),"ecx");
	&mov	(&DWP(12,"edi"),"edx");

	&mov	("eax",&DWP(16,"edi"));
	&mov	("ebx",&DWP(20,"edi"));
	&mov	("ecx",&DWP(24,"edi"));
	&adc	("eax",0);
	&mov	("edx",&DWP(28,"edi"));
	&adc	("ebx",0);
	&mov	(&DWP(16,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(20,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(24,"edi"),"ecx");
	&mov	(&DWP(28,"edi"),"edx");

	&ret	();
&function_end_B("_ecp_nistz256_sub");

########################################################################
# void GFp_nistz256_neg(BN_ULONG edi[8],const BN_ULONG esi[8]);
# Implemented as 0 - a via _ecp_nistz256_sub on a zeroed stack buffer.
&function_begin("GFp_nistz256_neg");
	&mov	("ebp",&wparam(1));
	&mov	("edi",&wparam(0));

	&xor	("eax","eax");
	&stack_push(8);				# 8-word zero scratch = minuend
	&mov	(&DWP(0,"esp"),"eax");
	&mov	("esi","esp");
	&mov	(&DWP(4,"esp"),"eax");
	&mov	(&DWP(8,"esp"),"eax");
	&mov	(&DWP(12,"esp"),"eax");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	(&DWP(20,"esp"),"eax");
	&mov	(&DWP(24,"esp"),"eax");
	&mov	(&DWP(28,"esp"),"eax");

	&call	("_ecp_nistz256_sub");

	&stack_pop(8);
&function_end("GFp_nistz256_neg");

# _picup_eax: returns its own return address in %eax (PIC base helper).
&function_begin_B("_picup_eax");
	&mov	("eax",&DWP(0,"esp"));
	&ret	();
&function_end_B("_picup_eax");

########################################################################
# void GFp_nistz256_mul_mont(BN_ULONG edi[8],const BN_ULONG esi[8],
#					const BN_ULONG ebp[8]);
&function_begin("GFp_nistz256_mul_mont");
	&mov	("esi",&wparam(1));
	&mov	("ebp",&wparam(2));
	if ($sse2) {
	&call	("_picup_eax");
  &set_label("pic");
	# Load GFp_ia32cap_P into %eax (capability word; NOTE(review): the
	# SSE2 body below is unconditional — see "We always use SSE2").
	&picmeup("eax","GFp_ia32cap_P","eax",&label("pic"));
	&mov	("eax",&DWP(0,"eax"));	}
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");
&function_end("GFp_nistz256_mul_mont");

# _ecp_nistz256_mul_mont: Montgomery multiplication,
# edi[8] = esi[8] * ebp[8] * 2^-256 mod p, using SSE2 32x32->64-bit
# multiplies (pmuludq) with lazy carries flattened at the end.
&function_begin_B("_ecp_nistz256_mul_mont");
    if ($sse2) {
	# We always use SSE2

	########################################
	# SSE2 code path featuring 32x16-bit
	# multiplications is ~2x faster than
	# IALU counterpart (except on Atom)...
	########################################
	# stack layout:
	# +------------------------------------+< %esp
	# | 7 16-byte temporary XMM words,     |
	# | "sliding" toward lower address     |
	# .                                    .
	# +------------------------------------+
	# | unused XMM word                    |
	# +------------------------------------+< +128,%ebx
	# | 8 16-byte XMM words holding copies |
	# | of a[i]<<64|a[i]                   |
	# .                                    .
	# .                                    .
	# +------------------------------------+< +256
	&mov	("edx","esp");			# save original %esp in %edx
	&sub	("esp",0x100);

	&movd	("xmm7",&DWP(0,"ebp"));		# b[0] -> 0000.00xy
	&lea	("ebp",&DWP(4,"ebp"));
	&pcmpeqd("xmm6","xmm6");
	&psrlq	("xmm6",48);			# compose 0xffff<<64|0xffff

	&pshuflw("xmm7","xmm7",0b11011100);	# 0000.00xy -> 0000.0x0y
	&and	("esp",-64);			# 64-byte align the scratch area
	&pshufd	("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y
	&lea	("ebx",&DWP(0x80,"esp"));

	&movd	("xmm0",&DWP(4*0,"esi"));	# a[0] -> 0000.00xy
	&pshufd	("xmm0","xmm0",0b11001100);	# 0000.00xy -> 00xy.00xy
	&movd	("xmm1",&DWP(4*1,"esi"));	# a[1] -> ...
	&movdqa	(&QWP(0x00,"ebx"),"xmm0");	# offload converted a[0]
	&pmuludq("xmm0","xmm7");		# a[0]*b[0]

	&movd	("xmm2",&DWP(4*2,"esi"));
	&pshufd	("xmm1","xmm1",0b11001100);
	&movdqa	(&QWP(0x10,"ebx"),"xmm1");
	&pmuludq("xmm1","xmm7");		# a[1]*b[0]

	&movq	("xmm4","xmm0");		# clear upper 64 bits
	&pslldq("xmm4",6);
	&paddq	("xmm4","xmm0");
	&movdqa("xmm5","xmm4");
	&psrldq("xmm4",10);			# upper 32 bits of a[0]*b[0]
	&pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[0]

	# Upper half of a[0]*b[i] is carried into next multiplication
	# iteration, while lower one "participates" in actual reduction.
	# Normally latter is done by accumulating result of multiplication
	# of modulus by "magic" digit, but thanks to special form of modulus
	# and "magic" digit it can be performed only with additions and
	# subtractions (see note in IALU section below). Note that we are
	# not bothered with carry bits, they are accumulated in "flatten"
	# phase after all multiplications and reductions.

	&movd	("xmm3",&DWP(4*3,"esi"));
	&pshufd	("xmm2","xmm2",0b11001100);
	&movdqa	(&QWP(0x20,"ebx"),"xmm2");
	&pmuludq("xmm2","xmm7");		# a[2]*b[0]
	&paddq	("xmm1","xmm4");		# a[1]*b[0]+hw(a[0]*b[0]), carry
	&movdqa	(&QWP(0x00,"esp"),"xmm1");	# t[0]

	&movd	("xmm0",&DWP(4*4,"esi"));
	&pshufd	("xmm3","xmm3",0b11001100);
	&movdqa	(&QWP(0x30,"ebx"),"xmm3");
	&pmuludq("xmm3","xmm7");		# a[3]*b[0]
	&movdqa	(&QWP(0x10,"esp"),"xmm2");

	&movd	("xmm1",&DWP(4*5,"esi"));
	&pshufd	("xmm0","xmm0",0b11001100);
	&movdqa	(&QWP(0x40,"ebx"),"xmm0");
	&pmuludq("xmm0","xmm7");		# a[4]*b[0]
	&paddq	("xmm3","xmm5");		# a[3]*b[0]+lw(a[0]*b[0]), reduction step
	&movdqa	(&QWP(0x20,"esp"),"xmm3");

	&movd	("xmm2",&DWP(4*6,"esi"));
	&pshufd	("xmm1","xmm1",0b11001100);
	&movdqa	(&QWP(0x50,"ebx"),"xmm1");
	&pmuludq("xmm1","xmm7");		# a[5]*b[0]
	&movdqa	(&QWP(0x30,"esp"),"xmm0");
	&pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movd	("xmm3",&DWP(4*7,"esi"));
	&pshufd	("xmm2","xmm2",0b11001100);
	&movdqa	(&QWP(0x60,"ebx"),"xmm2");
	&pmuludq("xmm2","xmm7");		# a[6]*b[0]
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	&psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step

	&movd	("xmm0",&DWP(0,"ebp"));		# b[1] -> 0000.00xy
	&pshufd	("xmm3","xmm3",0b11001100);
	&movdqa	(&QWP(0x70,"ebx"),"xmm3");
	&pmuludq("xmm3","xmm7");		# a[7]*b[0]

	&pshuflw("xmm7","xmm0",0b11011100);	# 0000.00xy -> 0000.0x0y
	&movdqa	("xmm0",&QWP(0x00,"ebx"));	# pre-load converted a[0]
	&pshufd	("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y

	&mov	("ecx",6);			# 6 middle iterations for b[1..6]
	&lea	("ebp",&DWP(4,"ebp"));
	&jmp	(&label("madd_sse2"));

&set_label("madd_sse2",16);
	&paddq	("xmm2","xmm5");		# a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled]
	&paddq	("xmm3","xmm4");		# a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled]
	&movdqa	("xmm1",&QWP(0x10,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[0]*b[i]
	&movdqa(&QWP(0x50,"esp"),"xmm2");

	&movdqa	("xmm2",&QWP(0x20,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[1]*b[i]
	&movdqa(&QWP(0x60,"esp"),"xmm3");
	&paddq	("xmm0",&QWP(0x00,"esp"));

	&movdqa	("xmm3",&QWP(0x30,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[2]*b[i]
	&movq	("xmm4","xmm0");		# clear upper 64 bits
	&pslldq("xmm4",6);
	&paddq	("xmm1",&QWP(0x10,"esp"));
	&paddq	("xmm4","xmm0");
	&movdqa("xmm5","xmm4");
	&psrldq("xmm4",10);			# upper 33 bits of a[0]*b[i]+t[0]

	&movdqa	("xmm0",&QWP(0x40,"ebx"));
	&pmuludq("xmm3","xmm7");		# a[3]*b[i]
	&paddq	("xmm1","xmm4");		# a[1]*b[i]+hw(a[0]*b[i]), carry
	&paddq	("xmm2",&QWP(0x20,"esp"));
	&movdqa	(&QWP(0x00,"esp"),"xmm1");

	&movdqa	("xmm1",&QWP(0x50,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[4]*b[i]
	&paddq	("xmm3",&QWP(0x30,"esp"));
	&movdqa	(&QWP(0x10,"esp"),"xmm2");
	&pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[i]

	&movdqa	("xmm2",&QWP(0x60,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[5]*b[i]
	&paddq	("xmm3","xmm5");		# a[3]*b[i]+lw(a[0]*b[i]), reduction step
	&paddq	("xmm0",&QWP(0x40,"esp"));
	&movdqa	(&QWP(0x20,"esp"),"xmm3");
	&pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movdqa	("xmm3","xmm7");
	&pmuludq("xmm2","xmm7");		# a[6]*b[i]
	&movd	("xmm7",&DWP(0,"ebp"));		# b[i++] -> 0000.00xy
	&lea	("ebp",&DWP(4,"ebp"));
	&paddq	("xmm1",&QWP(0x50,"esp"));
	&psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step
	&movdqa	(&QWP(0x30,"esp"),"xmm0");
	&pshuflw("xmm7","xmm7",0b11011100);	# 0000.00xy -> 0000.0x0y

	&pmuludq("xmm3",&QWP(0x70,"ebx"));	# a[7]*b[i]
	&pshufd("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y
	&movdqa("xmm0",&QWP(0x00,"ebx"));	# pre-load converted a[0]
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	&paddq	("xmm2",&QWP(0x60,"esp"));

	&dec	("ecx");
	&jnz	(&label("madd_sse2"));

	# Final (b[7]) iteration, peeled out of the loop above.
	&paddq	("xmm2","xmm5");		# a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled]
	&paddq	("xmm3","xmm4");		# a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled]
	&movdqa	("xmm1",&QWP(0x10,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[0]*b[7]
	&movdqa(&QWP(0x50,"esp"),"xmm2");

	&movdqa	("xmm2",&QWP(0x20,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[1]*b[7]
	&movdqa(&QWP(0x60,"esp"),"xmm3");
	&paddq	("xmm0",&QWP(0x00,"esp"));

	&movdqa	("xmm3",&QWP(0x30,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[2]*b[7]
	&movq	("xmm4","xmm0");		# clear upper 64 bits
	&pslldq("xmm4",6);
	&paddq	("xmm1",&QWP(0x10,"esp"));
	&paddq	("xmm4","xmm0");
	&movdqa("xmm5","xmm4");
	&psrldq("xmm4",10);			# upper 33 bits of a[0]*b[i]+t[0]

	&movdqa	("xmm0",&QWP(0x40,"ebx"));
	&pmuludq("xmm3","xmm7");		# a[3]*b[7]
	&paddq	("xmm1","xmm4");		# a[1]*b[7]+hw(a[0]*b[7]), carry
	&paddq	("xmm2",&QWP(0x20,"esp"));
	&movdqa	(&QWP(0x00,"esp"),"xmm1");

	&movdqa	("xmm1",&QWP(0x50,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[4]*b[7]
	&paddq	("xmm3",&QWP(0x30,"esp"));
	&movdqa	(&QWP(0x10,"esp"),"xmm2");
	&pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[i]

	&movdqa	("xmm2",&QWP(0x60,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[5]*b[7]
	&paddq	("xmm3","xmm5");		# reduction step
	&paddq	("xmm0",&QWP(0x40,"esp"));
	&movdqa	(&QWP(0x20,"esp"),"xmm3");
	&pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movdqa	("xmm3",&QWP(0x70,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[6]*b[7]
	&paddq	("xmm1",&QWP(0x50,"esp"));
	&psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step
	&movdqa	(&QWP(0x30,"esp"),"xmm0");

	&pmuludq("xmm3","xmm7");		# a[7]*b[7]
	&pcmpeqd("xmm7","xmm7");
	&movdqa	("xmm0",&QWP(0x00,"esp"));
	&pslldq	("xmm7",8);			# xmm7 = mask for upper 64 bits
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	&paddq	("xmm2",&QWP(0x60,"esp"));

	&paddq	("xmm2","xmm5");		# a[6]*b[7]+lw(a[0]*b[7]), reduction step
	&paddq	("xmm3","xmm4");		# a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step
	&movdqa(&QWP(0x50,"esp"),"xmm2");
	&movdqa(&QWP(0x60,"esp"),"xmm3");

	&movdqa	("xmm1",&QWP(0x10,"esp"));
	&movdqa	("xmm2",&QWP(0x20,"esp"));
	&movdqa	("xmm3",&QWP(0x30,"esp"));

	# "Flatten" the lazy 64-bit accumulators into 32-bit limbs,
	# propagating carries, while simultaneously trial-subtracting
	# the modulus to decide on the final conditional correction.
	&movq	("xmm4","xmm0");		# "flatten"
	&pand	("xmm0","xmm7");
	&xor	("ebp","ebp");
	&pslldq	("xmm4",6);
	&movq	("xmm5","xmm1");
	&paddq	("xmm0","xmm4");
	&pand	("xmm1","xmm7");
	&psrldq	("xmm0",6);
	&movd	("eax","xmm0");
	&psrldq	("xmm0",4);

	&paddq	("xmm5","xmm0");
	&movdqa	("xmm0",&QWP(0x40,"esp"));
	&sub	("eax",-1);			# start subtracting modulus,
						# this is used to determine
						# if result is larger/smaller
						# than modulus (see below)
	&pslldq	("xmm5",6);
	&movq	("xmm4","xmm2");
	&paddq	("xmm1","xmm5");
	&pand	("xmm2","xmm7");
	&psrldq	("xmm1",6);
	&mov	(&DWP(4*0,"edi"),"eax");
	&movd	("eax","xmm1");
	&psrldq	("xmm1",4);

	&paddq	("xmm4","xmm1");
	&movdqa	("xmm1",&QWP(0x50,"esp"));
	&sbb	("eax",-1);
	&pslldq	("xmm4",6);
	&movq	("xmm5","xmm3");
	&paddq	("xmm2","xmm4");
	&pand	("xmm3","xmm7");
	&psrldq	("xmm2",6);
	&mov	(&DWP(4*1,"edi"),"eax");
	&movd	("eax","xmm2");
	&psrldq	("xmm2",4);

	&paddq	("xmm5","xmm2");
	&movdqa	("xmm2",&QWP(0x60,"esp"));
	&sbb	("eax",-1);
	&pslldq	("xmm5",6);
	&movq	("xmm4","xmm0");
	&paddq	("xmm3","xmm5");
	&pand	("xmm0","xmm7");
	&psrldq	("xmm3",6);
	&mov	(&DWP(4*2,"edi"),"eax");
	&movd	("eax","xmm3");
	&psrldq	("xmm3",4);

	&paddq	("xmm4","xmm3");
	&sbb	("eax",0);
	&pslldq	("xmm4",6);
	&movq	("xmm5","xmm1");
	&paddq	("xmm0","xmm4");
	&pand	("xmm1","xmm7");
	&psrldq	("xmm0",6);
	&mov	(&DWP(4*3,"edi"),"eax");
	&movd	("eax","xmm0");
	&psrldq	("xmm0",4);

	&paddq	("xmm5","xmm0");
	&sbb	("eax",0);
	&pslldq	("xmm5",6);
	&movq	("xmm4","xmm2");
	&paddq	("xmm1","xmm5");
	&pand	("xmm2","xmm7");
	&psrldq	("xmm1",6);
	&movd	("ebx","xmm1");
	&psrldq	("xmm1",4);
	&mov	("esp","edx");			# restore original %esp

	&paddq	("xmm4","xmm1");
	&pslldq	("xmm4",6);
	&paddq	("xmm2","xmm4");
	&psrldq	("xmm2",6);
	&movd	("ecx","xmm2");
	&psrldq	("xmm2",4);
	&sbb	("ebx",0);
	&movd	("edx","xmm2");
	&pextrw	("esi","xmm2",2);		# top-most overflow bit
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);			# borrow from subtraction

	# Final step is "if result > mod, subtract mod", and at this point
	# we have result - mod written to output buffer, as well as borrow
	# bit from this subtraction, and if borrow bit is set, we add
	# modulus back.
	#
	# Note that because mod has special form, i.e. consists of
	# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning borrow bit to one register, %ebp, and its negative
	# to another, %esi. But we started by calculating %esi...

	&sub	("ebp","esi");
	&add	(&DWP(4*0,"edi"),"esi");	# add modulus or zero
	&adc	(&DWP(4*1,"edi"),"esi");
	&adc	(&DWP(4*2,"edi"),"esi");
	&adc	(&DWP(4*3,"edi"),0);
	&adc	("eax",0);
	&adc	("ebx",0);
	&mov	(&DWP(4*4,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(4*5,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(4*6,"edi"),"ecx");
	&mov	(&DWP(4*7,"edi"),"edx");

	&ret	();

}	# Non-SSE2 code removed.
&function_end_B("_ecp_nistz256_mul_mont");

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
&static_label("point_double_shortcut");
&function_begin("GFp_nistz256_point_double");
{   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));

    &mov	("esi",&wparam(1));

	# above map() describes stack layout with 5 temporary
	# 256-bit vectors on top, then we take extra word for
	# GFp_ia32cap_P copy.
	&stack_push(8*5+1);
	if ($sse2) {
	&call	("_picup_eax");
  &set_label("pic");
	&picmeup("edx","GFp_ia32cap_P","eax",&label("pic"));
	&mov	("ebp",&DWP(0,"edx"));	}

&set_label("point_double_shortcut");
	&mov	("eax",&DWP(0,"esi"));		# copy in_x
	&mov	("ebx",&DWP(4,"esi"));
	&mov	("ecx",&DWP(8,"esi"));
	&mov	("edx",&DWP(12,"esi"));
	&mov	(&DWP($in_x+0,"esp"),"eax");
	&mov	(&DWP($in_x+4,"esp"),"ebx");
	&mov	(&DWP($in_x+8,"esp"),"ecx");
	&mov	(&DWP($in_x+12,"esp"),"edx");
	&mov	("eax",&DWP(16,"esi"));
	&mov	("ebx",&DWP(20,"esi"));
	&mov	("ecx",&DWP(24,"esi"));
	&mov	("edx",&DWP(28,"esi"));
	&mov	(&DWP($in_x+16,"esp"),"eax");
	&mov	(&DWP($in_x+20,"esp"),"ebx");
	&mov	(&DWP($in_x+24,"esp"),"ecx");
	&mov	(&DWP($in_x+28,"esp"),"edx");
	&mov	(&DWP(32*5,"esp"),"ebp");	# GFp_ia32cap_P copy

	&lea	("ebp",&DWP(32,"esi"));
	&lea	("esi",&DWP(32,"esi"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(S, in_y);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&mov	("esi",64);
	&add	("esi",&wparam(1));
	&lea	("edi",&DWP($Zsqr,"esp"));
	&mov	("ebp","esi");
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(Zsqr, in_z);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(S, S);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&mov	("ebp",&wparam(1));
	&lea	("esi",&DWP(32,"ebp"));
	&lea	("ebp",&DWP(64,"ebp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(tmp0, in_z, in_y);

	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_add");		# p256_add(M, in_x, Zsqr);

	&mov	("edi",64);
	&lea	("esi",&DWP($tmp0,"esp"));
	&lea	("ebp",&DWP($tmp0,"esp"));
	&add	("edi",&wparam(0));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(res_z, tmp0);

	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($Zsqr,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(Zsqr, in_x, Zsqr);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(tmp0, S);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($Zsqr,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(M, M, Zsqr);

	&mov	("edi",32);
	&lea	("esi",&DWP($tmp0,"esp"));
	&add	("edi",&wparam(0));
	&call	("_ecp_nistz256_div_by_2");	# p256_div_by_2(res_y, tmp0);

	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_add");		# 1/2 p256_mul_by_3(M, M);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($in_x,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S, S, in_x);

	&lea	("esi",&DWP($tmp0,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&lea	("edi",&DWP($M,"esp"));
	&call	("_ecp_nistz256_add");		# 2/2 p256_mul_by_3(M, M);

	&lea	("esi",&DWP($S,"esp"));
	&lea	("ebp",&DWP($S,"esp"));
	&lea	("edi",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_add");		# p256_mul_by_2(tmp0, S);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&lea	("esi",&DWP($M,"esp"));
	&lea	("ebp",&DWP($M,"esp"));
	&mov	("edi",&wparam(0));
	&call	("_ecp_nistz256_mul_mont");	# p256_sqr_mont(res_x, M);

	&mov	("esi","edi");			# %edi is still res_x here
	&lea	("ebp",&DWP($tmp0,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(res_x, res_x, tmp0);

	&lea	("esi",&DWP($S,"esp"));
	&mov	("ebp","edi");			# %edi is still res_x
	&lea	("edi",&DWP($S,"esp"));
	&call	("_ecp_nistz256_sub");		# p256_sub(S, S, res_x);

	&mov	("eax",&DWP(32*5,"esp"));	# GFp_ia32cap_P copy
	&mov	("esi","edi");			# %edi is still &S
	&lea	("ebp",&DWP($M,"esp"));
	&call	("_ecp_nistz256_mul_mont");	# p256_mul_mont(S, S, M);

	&mov	("ebp",32);
	&lea	("esi",&DWP($S,"esp"));
	&add	("ebp",&wparam(0));
	&mov	("edi","ebp");
	&call	("_ecp_nistz256_sub");		# p256_sub(res_y, S, res_y);

	&stack_pop(8*5+1);
} &function_end("GFp_nistz256_point_double");

########################################################################
# void GFp_nistz256_point_add_affine(P256_POINT *out,
#				     const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
&function_begin("GFp_nistz256_point_add_affine");
{
    my ($res_x,$res_y,$res_z,
	$in1_x,$in1_y,$in1_z,
	$in2_x,$in2_y,
	$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
    my $Z1sqr = $S2;
    my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);

    &mov	("esi",&wparam(1));

	# above map() describes stack layout with 15 temporary
	# 256-bit vectors on top, then we take extra words for
	# !in1infty, !in2infty, and
GFp_ia32cap_P copy. 928 &stack_push(8*15+3); 929 if ($sse2) { 930 &call ("_picup_eax"); 931 &set_label("pic"); 932 &picmeup("edx","GFp_ia32cap_P","eax",&label("pic")); 933 &mov ("ebp",&DWP(0,"edx")); } 934 935 &lea ("edi",&DWP($in1_x,"esp")); 936 for($i=0;$i<96;$i+=16) { 937 &mov ("eax",&DWP($i+0,"esi")); # copy in1 938 &mov ("ebx",&DWP($i+4,"esi")); 939 &mov ("ecx",&DWP($i+8,"esi")); 940 &mov ("edx",&DWP($i+12,"esi")); 941 &mov (&DWP($i+0,"edi"),"eax"); 942 &mov (&DWP(32*15+8,"esp"),"ebp") if ($i==0); 943 &mov ("ebp","eax") if ($i==64); 944 &or ("ebp","eax") if ($i>64); 945 &mov (&DWP($i+4,"edi"),"ebx"); 946 &or ("ebp","ebx") if ($i>=64); 947 &mov (&DWP($i+8,"edi"),"ecx"); 948 &or ("ebp","ecx") if ($i>=64); 949 &mov (&DWP($i+12,"edi"),"edx"); 950 &or ("ebp","edx") if ($i>=64); 951 } 952 &xor ("eax","eax"); 953 &mov ("esi",&wparam(2)); 954 &sub ("eax","ebp"); 955 &or ("ebp","eax"); 956 &sar ("ebp",31); 957 &mov (&DWP(32*15+0,"esp"),"ebp"); # !in1infty 958 959 &lea ("edi",&DWP($in2_x,"esp")); 960 for($i=0;$i<64;$i+=16) { 961 &mov ("eax",&DWP($i+0,"esi")); # copy in2 962 &mov ("ebx",&DWP($i+4,"esi")); 963 &mov ("ecx",&DWP($i+8,"esi")); 964 &mov ("edx",&DWP($i+12,"esi")); 965 &mov (&DWP($i+0,"edi"),"eax"); 966 &mov ("ebp","eax") if ($i==0); 967 &or ("ebp","eax") if ($i!=0); 968 &mov (&DWP($i+4,"edi"),"ebx"); 969 &or ("ebp","ebx"); 970 &mov (&DWP($i+8,"edi"),"ecx"); 971 &or ("ebp","ecx"); 972 &mov (&DWP($i+12,"edi"),"edx"); 973 &or ("ebp","edx"); 974 } 975 &xor ("ebx","ebx"); 976 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 977 &sub ("ebx","ebp"); 978 &lea ("esi",&DWP($in1_z,"esp")); 979 &or ("ebx","ebp"); 980 &lea ("ebp",&DWP($in1_z,"esp")); 981 &sar ("ebx",31); 982 &lea ("edi",&DWP($Z1sqr,"esp")); 983 &mov (&DWP(32*15+4,"esp"),"ebx"); # !in2infty 984 985 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Z1sqr, in1_z); 986 987 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 988 &lea ("esi",&DWP($in2_x,"esp")); 989 &mov ("ebp","edi"); # %esi is stull 
&Z1sqr 990 &lea ("edi",&DWP($U2,"esp")); 991 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, Z1sqr, in2_x); 992 993 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 994 &lea ("esi",&DWP($in1_z,"esp")); 995 &lea ("ebp",&DWP($Z1sqr,"esp")); 996 &lea ("edi",&DWP($S2,"esp")); 997 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Z1sqr, in1_z); 998 999 &lea ("esi",&DWP($U2,"esp")); 1000 &lea ("ebp",&DWP($in1_x,"esp")); 1001 &lea ("edi",&DWP($H,"esp")); 1002 &call ("_ecp_nistz256_sub"); # p256_sub(H, U2, in1_x); 1003 1004 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1005 &lea ("esi",&DWP($in2_y,"esp")); 1006 &lea ("ebp",&DWP($S2,"esp")); 1007 &lea ("edi",&DWP($S2,"esp")); 1008 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, S2, in2_y); 1009 1010 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1011 &lea ("esi",&DWP($in1_z,"esp")); 1012 &lea ("ebp",&DWP($H,"esp")); 1013 &lea ("edi",&DWP($res_z,"esp")); 1014 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_z, H, in1_z); 1015 1016 &lea ("esi",&DWP($S2,"esp")); 1017 &lea ("ebp",&DWP($in1_y,"esp")); 1018 &lea ("edi",&DWP($R,"esp")); 1019 &call ("_ecp_nistz256_sub"); # p256_sub(R, S2, in1_y); 1020 1021 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1022 &lea ("esi",&DWP($H,"esp")); 1023 &lea ("ebp",&DWP($H,"esp")); 1024 &lea ("edi",&DWP($Hsqr,"esp")); 1025 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Hsqr, H); 1026 1027 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1028 &lea ("esi",&DWP($R,"esp")); 1029 &lea ("ebp",&DWP($R,"esp")); 1030 &lea ("edi",&DWP($Rsqr,"esp")); 1031 &call ("_ecp_nistz256_mul_mont"); # p256_sqr_mont(Rsqr, R); 1032 1033 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1034 &lea ("esi",&DWP($in1_x,"esp")); 1035 &lea ("ebp",&DWP($Hsqr,"esp")); 1036 &lea ("edi",&DWP($U2,"esp")); 1037 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(U2, in1_x, Hsqr); 1038 1039 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1040 &lea 
("esi",&DWP($H,"esp")); 1041 &lea ("ebp",&DWP($Hsqr,"esp")); 1042 &lea ("edi",&DWP($Hcub,"esp")); 1043 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(Hcub, Hsqr, H); 1044 1045 &lea ("esi",&DWP($U2,"esp")); 1046 &lea ("ebp",&DWP($U2,"esp")); 1047 &lea ("edi",&DWP($Hsqr,"esp")); 1048 &call ("_ecp_nistz256_add"); # p256_mul_by_2(Hsqr, U2); 1049 1050 &lea ("esi",&DWP($Rsqr,"esp")); 1051 &lea ("ebp",&DWP($Hsqr,"esp")); 1052 &lea ("edi",&DWP($res_x,"esp")); 1053 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, Rsqr, Hsqr); 1054 1055 &lea ("esi",&DWP($res_x,"esp")); 1056 &lea ("ebp",&DWP($Hcub,"esp")); 1057 &lea ("edi",&DWP($res_x,"esp")); 1058 &call ("_ecp_nistz256_sub"); # p256_sub(res_x, res_x, Hcub); 1059 1060 &lea ("esi",&DWP($U2,"esp")); 1061 &lea ("ebp",&DWP($res_x,"esp")); 1062 &lea ("edi",&DWP($res_y,"esp")); 1063 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, U2, res_x); 1064 1065 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1066 &lea ("esi",&DWP($Hcub,"esp")); 1067 &lea ("ebp",&DWP($in1_y,"esp")); 1068 &lea ("edi",&DWP($S2,"esp")); 1069 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(S2, Hcub, in1_y); 1070 1071 &mov ("eax",&DWP(32*15+8,"esp")); # GFp_ia32cap_P copy 1072 &lea ("esi",&DWP($R,"esp")); 1073 &lea ("ebp",&DWP($res_y,"esp")); 1074 &lea ("edi",&DWP($res_y,"esp")); 1075 &call ("_ecp_nistz256_mul_mont"); # p256_mul_mont(res_y, res_y, R); 1076 1077 &lea ("esi",&DWP($res_y,"esp")); 1078 &lea ("ebp",&DWP($S2,"esp")); 1079 &lea ("edi",&DWP($res_y,"esp")); 1080 &call ("_ecp_nistz256_sub"); # p256_sub(res_y, res_y, S2); 1081 1082 &mov ("ebp",&DWP(32*15+0,"esp")); # !in1infty 1083 &mov ("esi",&DWP(32*15+4,"esp")); # !in2infty 1084 &mov ("edi",&wparam(0)); 1085 &mov ("edx","ebp"); 1086 ¬ ("ebp"); 1087 &and ("edx","esi"); 1088 &and ("ebp","esi"); 1089 ¬ ("esi"); 1090 1091 ######################################## 1092 # conditional moves 1093 for($i=64;$i<96;$i+=4) { 1094 my $one=@ONE_mont[($i-64)/4]; 1095 1096 &mov ("eax","edx"); 1097 &and 
("eax",&DWP($res_x+$i,"esp")); 1098 &mov ("ebx","ebp") if ($one && $one!=-1); 1099 &and ("ebx",$one) if ($one && $one!=-1); 1100 &mov ("ecx","esi"); 1101 &and ("ecx",&DWP($in1_x+$i,"esp")); 1102 &or ("eax",$one==-1?"ebp":"ebx") if ($one); 1103 &or ("eax","ecx"); 1104 &mov (&DWP($i,"edi"),"eax"); 1105 } 1106 for($i=0;$i<64;$i+=4) { 1107 &mov ("eax","edx"); 1108 &and ("eax",&DWP($res_x+$i,"esp")); 1109 &mov ("ebx","ebp"); 1110 &and ("ebx",&DWP($in2_x+$i,"esp")); 1111 &mov ("ecx","esi"); 1112 &and ("ecx",&DWP($in1_x+$i,"esp")); 1113 &or ("eax","ebx"); 1114 &or ("eax","ecx"); 1115 &mov (&DWP($i,"edi"),"eax"); 1116 } 1117 &stack_pop(8*15+3); 1118} &function_end("GFp_nistz256_point_add_affine"); 1119 1120&asm_finish(); 1121 1122close STDOUT or die "error closing STDOUT"; 1123