1#! /usr/bin/env perl 2# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16 17# September 2011 18# 19# Assembler helpers for Padlock engine. Compared to original engine 20# version relying on inline assembler and compiled with gcc 3.4.6 it 21# was measured to provide ~100% improvement on misaligned data in ECB 22# mode and ~75% in CBC mode. For aligned data improvement can be 23# observed for short inputs only, e.g. 45% for 64-byte messages in 24# ECB mode, 20% in CBC. Difference in performance for aligned vs. 25# misaligned data depends on misalignment and is either ~1.8x or 2.9x. 26# These are approximately same factors as for hardware support, so 27# there is little reason to rely on the latter. On the contrary, it 28# might actually hurt performance in mixture of aligned and misaligned 29# buffers, because a) if you choose to flip 'align' flag in control 30# word on per-buffer basis, then you'd have to reload key context, 31# which incurs penalty; b) if you choose to set 'align' flag 32# permanently, it limits performance even for aligned data to ~1/2. 33# All above mentioned results were collected on 1.5GHz C7. Nano on the 34# other hand handles unaligned data more gracefully. Depending on 35# algorithm and how unaligned data is, hardware can be up to 70% more 36# efficient than below software alignment procedures, nor does 'align' 37# flag have affect on aligned performance [if has any meaning at all]. 38# Therefore suggestion is to unconditionally set 'align' flag on Nano 39# for optimal performance. 40 41$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 42push(@INC,"${dir}","${dir}../../crypto/perlasm"); 43require "x86asm.pl"; 44 45$output=pop; 46open STDOUT,">$output"; 47 48&asm_init($ARGV[0]); 49 50%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata 51$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16 52 53$ctx="edx"; 54$out="edi"; 55$inp="esi"; 56$len="ecx"; 57$chunk="ebx"; 58 59&function_begin_B("padlock_capability"); 60 &push ("ebx"); 61 &pushf (); 62 &pop ("eax"); 63 &mov ("ecx","eax"); 64 &xor ("eax",1<<21); 65 &push ("eax"); 66 &popf (); 67 &pushf (); 68 &pop ("eax"); 69 &xor ("ecx","eax"); 70 &xor ("eax","eax"); 71 &bt ("ecx",21); 72 &jnc (&label("noluck")); 73 &cpuid (); 74 &xor ("eax","eax"); 75 &cmp ("ebx","0x".unpack("H*",'tneC')); 76 &jne (&label("zhaoxin")); 77 &cmp ("edx","0x".unpack("H*",'Hrua')); 78 &jne (&label("noluck")); 79 &cmp ("ecx","0x".unpack("H*",'slua')); 80 &jne (&label("noluck")); 81 &jmp (&label("zhaoxinEnd")); 82&set_label("zhaoxin"); 83 &cmp ("ebx","0x".unpack("H*",'hS ')); 84 &jne (&label("noluck")); 85 &cmp ("edx","0x".unpack("H*",'hgna')); 86 &jne (&label("noluck")); 87 &cmp ("ecx","0x".unpack("H*",' ia')); 88 &jne (&label("noluck")); 89&set_label("zhaoxinEnd"); 90 &mov ("eax",0xC0000000); 91 &cpuid (); 92 &mov ("edx","eax"); 93 &xor ("eax","eax"); 94 &cmp ("edx",0xC0000001); 95 &jb (&label("noluck")); 96 &mov ("eax",1); 97 &cpuid (); 98 &or ("eax",0x0f); 99 &xor ("ebx","ebx"); 100 &and ("eax",0x0fff); 101 &cmp ("eax",0x06ff); # check for Nano 102 &sete ("bl"); 103 &mov ("eax",0xC0000001); 104 &push ("ebx"); 105 &cpuid (); 106 &pop ("ebx"); 107 &mov ("eax","edx"); 108 &shl ("ebx",4); # bit#4 denotes Nano 109 &and ("eax",0xffffffef); 110 &or ("eax","ebx") 111&set_label("noluck"); 112 &pop ("ebx"); 113 &ret (); 114&function_end_B("padlock_capability") 115 116&function_begin_B("padlock_key_bswap"); 117 &mov ("edx",&wparam(0)); 118 &mov ("ecx",&DWP(240,"edx")); 119&set_label("bswap_loop"); 120 &mov ("eax",&DWP(0,"edx")); 121 &bswap ("eax"); 122 &mov (&DWP(0,"edx"),"eax"); 123 &lea ("edx",&DWP(4,"edx")); 124 &sub ("ecx",1); 125 &jnz (&label("bswap_loop")); 126 &ret (); 127&function_end_B("padlock_key_bswap"); 128 129# This is heuristic key context tracing. At first one 130# believes that one should use atomic swap instructions, 131# but it's not actually necessary. Point is that if 132# padlock_saved_context was changed by another thread 133# after we've read it and before we compare it with ctx, 134# our key *shall* be reloaded upon thread context switch 135# and we are therefore set in either case... 136&static_label("padlock_saved_context"); 137 138&function_begin_B("padlock_verify_context"); 139 &mov ($ctx,&wparam(0)); 140 &lea ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) : 141 &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point"))); 142 &pushf (); 143 &call ("_padlock_verify_ctx"); 144&set_label("verify_pic_point"); 145 &lea ("esp",&DWP(4,"esp")); 146 &ret (); 147&function_end_B("padlock_verify_context"); 148 149&function_begin_B("_padlock_verify_ctx"); 150 &add ("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context 151 &bt (&DWP(4,"esp"),30); # eflags 152 &jnc (&label("verified")); 153 &cmp ($ctx,&DWP(0,"eax")); 154 &je (&label("verified")); 155 &pushf (); 156 &popf (); 157&set_label("verified"); 158 &mov (&DWP(0,"eax"),$ctx); 159 &ret (); 160&function_end_B("_padlock_verify_ctx"); 161 162&function_begin_B("padlock_reload_key"); 163 &pushf (); 164 &popf (); 165 &ret (); 166&function_end_B("padlock_reload_key"); 167 168&function_begin_B("padlock_aes_block"); 169 &push ("edi"); 170 &push ("esi"); 171 &push ("ebx"); 172 &mov ($out,&wparam(0)); # must be 16-byte aligned 173 &mov ($inp,&wparam(1)); # must be 16-byte aligned 174 &mov ($ctx,&wparam(2)); 175 &mov ($len,1); 176 &lea ("ebx",&DWP(32,$ctx)); # key 177 &lea ($ctx,&DWP(16,$ctx)); # control word 178 &data_byte(0xf3,0x0f,0xa7,0xc8); # rep xcryptecb 179 &pop ("ebx"); 180 &pop ("esi"); 181 &pop ("edi"); 182 &ret (); 183&function_end_B("padlock_aes_block"); 184 185sub generate_mode { 186my ($mode,$opcode) = @_; 187# int padlock_$mode_encrypt(void *out, const void *inp, 188# struct padlock_cipher_data *ctx, size_t len); 189&function_begin("padlock_${mode}_encrypt"); 190 &mov ($out,&wparam(0)); 191 &mov ($inp,&wparam(1)); 192 &mov ($ctx,&wparam(2)); 193 &mov ($len,&wparam(3)); 194 &test ($ctx,15); 195 &jnz (&label("${mode}_abort")); 196 &test ($len,15); 197 &jnz (&label("${mode}_abort")); 198 &lea ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) : 199 &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point"))); 200 &pushf (); 201 &cld (); 202 &call ("_padlock_verify_ctx"); 203&set_label("${mode}_pic_point"); 204 &lea ($ctx,&DWP(16,$ctx)); # control word 205 &xor ("eax","eax"); 206 if ($mode eq "ctr32") { 207 &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter 208 } else { 209 &xor ("ebx","ebx"); 210 &test (&DWP(0,$ctx),1<<5); # align bit in control word 211 &jnz (&label("${mode}_aligned")); 212 &test ($out,0x0f); 213 &setz ("al"); # !out_misaligned 214 &test ($inp,0x0f); 215 &setz ("bl"); # !inp_misaligned 216 &test ("eax","ebx"); 217 &jnz (&label("${mode}_aligned")); 218 &neg ("eax"); 219 } 220 &mov ($chunk,$PADLOCK_CHUNK); 221 ¬ ("eax"); # out_misaligned?-1:0 222 &lea ("ebp",&DWP(-24,"esp")); 223 &cmp ($len,$chunk); 224 &cmovc ($chunk,$len); # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len 225 &and ("eax",$chunk); # out_misaligned?chunk:0 226 &mov ($chunk,$len); 227 &neg ("eax"); 228 &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK 229 &lea ("esp",&DWP(0,"eax","ebp")); # alloca 230 &mov ("eax",$PADLOCK_CHUNK); 231 &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK 232 &mov ("eax","ebp"); 233 &and ("ebp",-16); 234 &and ("esp",-16); 235 &mov (&DWP(16,"ebp"),"eax"); 236 if ($PADLOCK_PREFETCH{$mode}) { 237 &cmp ($len,$chunk); 238 &ja (&label("${mode}_loop")); 239 &mov ("eax",$inp); # check if prefetch crosses page 240 &cmp ("ebp","esp"); 241 &cmove ("eax",$out); 242 &add ("eax",$len); 243 &neg ("eax"); 244 &and ("eax",0xfff); # distance to page boundary 245 &cmp ("eax",$PADLOCK_PREFETCH{$mode}); 246 &mov ("eax",-$PADLOCK_PREFETCH{$mode}); 247 &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1 248 &and ($chunk,"eax"); 249 &jz (&label("${mode}_unaligned_tail")); 250 } 251 &jmp (&label("${mode}_loop")); 252 253&set_label("${mode}_loop",16); 254 &mov (&DWP(0,"ebp"),$out); # save parameters 255 &mov (&DWP(4,"ebp"),$inp); 256 &mov (&DWP(8,"ebp"),$len); 257 &mov ($len,$chunk); 258 &mov (&DWP(12,"ebp"),$chunk); # chunk 259 if ($mode eq "ctr32") { 260 &mov ("ecx",&DWP(-4,$ctx)); 261 &xor ($out,$out); 262 &mov ("eax",&DWP(-8,$ctx)); # borrow $len 263&set_label("${mode}_prepare"); 264 &mov (&DWP(12,"esp",$out),"ecx"); 265 &bswap ("ecx"); 266 &movq (&QWP(0,"esp",$out),"mm0"); 267 &inc ("ecx"); 268 &mov (&DWP(8,"esp",$out),"eax"); 269 &bswap ("ecx"); 270 &lea ($out,&DWP(16,$out)); 271 &cmp ($out,$chunk); 272 &jb (&label("${mode}_prepare")); 273 274 &mov (&DWP(-4,$ctx),"ecx"); 275 &lea ($inp,&DWP(0,"esp")); 276 &lea ($out,&DWP(0,"esp")); 277 &mov ($len,$chunk); 278 } else { 279 &test ($out,0x0f); # out_misaligned 280 &cmovnz ($out,"esp"); 281 &test ($inp,0x0f); # inp_misaligned 282 &jz (&label("${mode}_inp_aligned")); 283 &shr ($len,2); 284 &data_byte(0xf3,0xa5); # rep movsl 285 &sub ($out,$chunk); 286 &mov ($len,$chunk); 287 &mov ($inp,$out); 288&set_label("${mode}_inp_aligned"); 289 } 290 &lea ("eax",&DWP(-16,$ctx)); # ivp 291 &lea ("ebx",&DWP(16,$ctx)); # key 292 &shr ($len,4); # len/=AES_BLOCK_SIZE 293 &data_byte(0xf3,0x0f,0xa7,$opcode); # rep xcrypt* 294 if ($mode !~ /ecb|ctr/) { 295 &movaps ("xmm0",&QWP(0,"eax")); 296 &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv 297 } 298 &mov ($out,&DWP(0,"ebp")); # restore parameters 299 &mov ($chunk,&DWP(12,"ebp")); 300 if ($mode eq "ctr32") { 301 &mov ($inp,&DWP(4,"ebp")); 302 &xor ($len,$len); 303&set_label("${mode}_xor"); 304 &movups ("xmm1",&QWP(0,$inp,$len)); 305 &lea ($len,&DWP(16,$len)); 306 &pxor ("xmm1",&QWP(-16,"esp",$len)); 307 &movups (&QWP(-16,$out,$len),"xmm1"); 308 &cmp ($len,$chunk); 309 &jb (&label("${mode}_xor")); 310 } else { 311 &test ($out,0x0f); 312 &jz (&label("${mode}_out_aligned")); 313 &mov ($len,$chunk); 314 &lea ($inp,&DWP(0,"esp")); 315 &shr ($len,2); 316 &data_byte(0xf3,0xa5); # rep movsl 317 &sub ($out,$chunk); 318&set_label("${mode}_out_aligned"); 319 &mov ($inp,&DWP(4,"ebp")); 320 } 321 &mov ($len,&DWP(8,"ebp")); 322 &add ($out,$chunk); 323 &add ($inp,$chunk); 324 &sub ($len,$chunk); 325 &mov ($chunk,$PADLOCK_CHUNK); 326 if (!$PADLOCK_PREFETCH{$mode}) { 327 &jnz (&label("${mode}_loop")); 328 } else { 329 &jz (&label("${mode}_break")); 330 &cmp ($len,$chunk); 331 &jae (&label("${mode}_loop")); 332 333&set_label("${mode}_unaligned_tail"); 334 &xor ("eax","eax"); 335 &cmp ("esp","ebp"); 336 &cmove ("eax",$len); 337 &sub ("esp","eax"); # alloca 338 &mov ("eax", $out); # save parameters 339 &mov ($chunk,$len); 340 &shr ($len,2); 341 &lea ($out,&DWP(0,"esp")); 342 &data_byte(0xf3,0xa5); # rep movsl 343 &mov ($inp,"esp"); 344 &mov ($out,"eax"); # restore parameters 345 &mov ($len,$chunk); 346 &jmp (&label("${mode}_loop")); 347 348&set_label("${mode}_break",16); 349 } 350 if ($mode ne "ctr32") { 351 &cmp ("esp","ebp"); 352 &je (&label("${mode}_done")); 353 } 354 &pxor ("xmm0","xmm0"); 355 &lea ("eax",&DWP(0,"esp")); 356&set_label("${mode}_bzero"); 357 &movaps (&QWP(0,"eax"),"xmm0"); 358 &lea ("eax",&DWP(16,"eax")); 359 &cmp ("ebp","eax"); 360 &ja (&label("${mode}_bzero")); 361 362&set_label("${mode}_done"); 363 &mov ("ebp",&DWP(16,"ebp")); 364 &lea ("esp",&DWP(24,"ebp")); 365 if ($mode ne "ctr32") { 366 &jmp (&label("${mode}_exit")); 367 368&set_label("${mode}_aligned",16); 369 if ($PADLOCK_PREFETCH{$mode}) { 370 &lea ("ebp",&DWP(0,$inp,$len)); 371 &neg ("ebp"); 372 &and ("ebp",0xfff); # distance to page boundary 373 &xor ("eax","eax"); 374 &cmp ("ebp",$PADLOCK_PREFETCH{$mode}); 375 &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1); 376 &cmovae ("ebp","eax"); 377 &and ("ebp",$len); # remainder 378 &sub ($len,"ebp"); 379 &jz (&label("${mode}_aligned_tail")); 380 } 381 &lea ("eax",&DWP(-16,$ctx)); # ivp 382 &lea ("ebx",&DWP(16,$ctx)); # key 383 &shr ($len,4); # len/=AES_BLOCK_SIZE 384 &data_byte(0xf3,0x0f,0xa7,$opcode); # rep xcrypt* 385 if ($mode ne "ecb") { 386 &movaps ("xmm0",&QWP(0,"eax")); 387 &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv 388 } 389 if ($PADLOCK_PREFETCH{$mode}) { 390 &test ("ebp","ebp"); 391 &jz (&label("${mode}_exit")); 392 393&set_label("${mode}_aligned_tail"); 394 &mov ($len,"ebp"); 395 &lea ("ebp",&DWP(-24,"esp")); 396 &mov ("esp","ebp"); 397 &mov ("eax","ebp"); 398 &sub ("esp",$len); 399 &and ("ebp",-16); 400 &and ("esp",-16); 401 &mov (&DWP(16,"ebp"),"eax"); 402 &mov ("eax", $out); # save parameters 403 &mov ($chunk,$len); 404 &shr ($len,2); 405 &lea ($out,&DWP(0,"esp")); 406 &data_byte(0xf3,0xa5); # rep movsl 407 &mov ($inp,"esp"); 408 &mov ($out,"eax"); # restore parameters 409 &mov ($len,$chunk); 410 &jmp (&label("${mode}_loop")); 411 } 412&set_label("${mode}_exit"); } 413 &mov ("eax",1); 414 &lea ("esp",&DWP(4,"esp")); # popf 415 &emms () if ($mode eq "ctr32"); 416&set_label("${mode}_abort"); 417&function_end("padlock_${mode}_encrypt"); 418} 419 420&generate_mode("ecb",0xc8); 421&generate_mode("cbc",0xd0); 422&generate_mode("cfb",0xe0); 423&generate_mode("ofb",0xe8); 424&generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode, 425 # because hardware CTR was introduced later 426 # and even has errata on certain C7 stepping. 427 # own implementation *always* works, though 428 # ~15% slower than dedicated hardware... 429 430&function_begin_B("padlock_xstore"); 431 &push ("edi"); 432 &mov ("edi",&wparam(0)); 433 &mov ("edx",&wparam(1)); 434 &data_byte(0x0f,0xa7,0xc0); # xstore 435 &pop ("edi"); 436 &ret (); 437&function_end_B("padlock_xstore"); 438 439&function_begin_B("_win32_segv_handler"); 440 &mov ("eax",1); # ExceptionContinueSearch 441 &mov ("edx",&wparam(0)); # *ExceptionRecord 442 &mov ("ecx",&wparam(2)); # *ContextRecord 443 &cmp (&DWP(0,"edx"),0xC0000005) # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION 444 &jne (&label("ret")); 445 &add (&DWP(184,"ecx"),4); # skip over rep sha* 446 &mov ("eax",0); # ExceptionContinueExecution 447&set_label("ret"); 448 &ret (); 449&function_end_B("_win32_segv_handler"); 450&safeseh("_win32_segv_handler") if ($::win32); 451 452&function_begin_B("padlock_sha1_oneshot"); 453 &push ("edi"); 454 &push ("esi"); 455 &xor ("eax","eax"); 456 &mov ("edi",&wparam(0)); 457 &mov ("esi",&wparam(1)); 458 &mov ("ecx",&wparam(2)); 459 if ($::win32 or $::coff) { 460 &push (&::islabel("_win32_segv_handler")); 461 &data_byte(0x64,0xff,0x30); # push %fs:(%eax) 462 &data_byte(0x64,0x89,0x20); # mov %esp,%fs:(%eax) 463 } 464 &mov ("edx","esp"); # put aside %esp 465 &add ("esp",-128); # 32 is enough but spec says 128 466 &movups ("xmm0",&QWP(0,"edi")); # copy-in context 467 &and ("esp",-16); 468 &mov ("eax",&DWP(16,"edi")); 469 &movaps (&QWP(0,"esp"),"xmm0"); 470 &mov ("edi","esp"); 471 &mov (&DWP(16,"esp"),"eax"); 472 &xor ("eax","eax"); 473 &data_byte(0xf3,0x0f,0xa6,0xc8); # rep xsha1 474 &movaps ("xmm0",&QWP(0,"esp")); 475 &mov ("eax",&DWP(16,"esp")); 476 &mov ("esp","edx"); # restore %esp 477 if ($::win32 or $::coff) { 478 &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0 479 &lea ("esp",&DWP(4,"esp")); 480 } 481 &mov ("edi",&wparam(0)); 482 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context 483 &mov (&DWP(16,"edi"),"eax"); 484 &pop ("esi"); 485 &pop ("edi"); 486 &ret (); 487&function_end_B("padlock_sha1_oneshot"); 488 489&function_begin_B("padlock_sha1_blocks"); 490 &push ("edi"); 491 &push ("esi"); 492 &mov ("edi",&wparam(0)); 493 &mov ("esi",&wparam(1)); 494 &mov ("edx","esp"); # put aside %esp 495 &mov ("ecx",&wparam(2)); 496 &add ("esp",-128); 497 &movups ("xmm0",&QWP(0,"edi")); # copy-in context 498 &and ("esp",-16); 499 &mov ("eax",&DWP(16,"edi")); 500 &movaps (&QWP(0,"esp"),"xmm0"); 501 &mov ("edi","esp"); 502 &mov (&DWP(16,"esp"),"eax"); 503 &mov ("eax",-1); 504 &data_byte(0xf3,0x0f,0xa6,0xc8); # rep xsha1 505 &movaps ("xmm0",&QWP(0,"esp")); 506 &mov ("eax",&DWP(16,"esp")); 507 &mov ("esp","edx"); # restore %esp 508 &mov ("edi",&wparam(0)); 509 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context 510 &mov (&DWP(16,"edi"),"eax"); 511 &pop ("esi"); 512 &pop ("edi"); 513 &ret (); 514&function_end_B("padlock_sha1_blocks"); 515 516&function_begin_B("padlock_sha256_oneshot"); 517 &push ("edi"); 518 &push ("esi"); 519 &xor ("eax","eax"); 520 &mov ("edi",&wparam(0)); 521 &mov ("esi",&wparam(1)); 522 &mov ("ecx",&wparam(2)); 523 if ($::win32 or $::coff) { 524 &push (&::islabel("_win32_segv_handler")); 525 &data_byte(0x64,0xff,0x30); # push %fs:(%eax) 526 &data_byte(0x64,0x89,0x20); # mov %esp,%fs:(%eax) 527 } 528 &mov ("edx","esp"); # put aside %esp 529 &add ("esp",-128); 530 &movups ("xmm0",&QWP(0,"edi")); # copy-in context 531 &and ("esp",-16); 532 &movups ("xmm1",&QWP(16,"edi")); 533 &movaps (&QWP(0,"esp"),"xmm0"); 534 &mov ("edi","esp"); 535 &movaps (&QWP(16,"esp"),"xmm1"); 536 &xor ("eax","eax"); 537 &data_byte(0xf3,0x0f,0xa6,0xd0); # rep xsha256 538 &movaps ("xmm0",&QWP(0,"esp")); 539 &movaps ("xmm1",&QWP(16,"esp")); 540 &mov ("esp","edx"); # restore %esp 541 if ($::win32 or $::coff) { 542 &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0 543 &lea ("esp",&DWP(4,"esp")); 544 } 545 &mov ("edi",&wparam(0)); 546 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context 547 &movups (&QWP(16,"edi"),"xmm1"); 548 &pop ("esi"); 549 &pop ("edi"); 550 &ret (); 551&function_end_B("padlock_sha256_oneshot"); 552 553&function_begin_B("padlock_sha256_blocks"); 554 &push ("edi"); 555 &push ("esi"); 556 &mov ("edi",&wparam(0)); 557 &mov ("esi",&wparam(1)); 558 &mov ("ecx",&wparam(2)); 559 &mov ("edx","esp"); # put aside %esp 560 &add ("esp",-128); 561 &movups ("xmm0",&QWP(0,"edi")); # copy-in context 562 &and ("esp",-16); 563 &movups ("xmm1",&QWP(16,"edi")); 564 &movaps (&QWP(0,"esp"),"xmm0"); 565 &mov ("edi","esp"); 566 &movaps (&QWP(16,"esp"),"xmm1"); 567 &mov ("eax",-1); 568 &data_byte(0xf3,0x0f,0xa6,0xd0); # rep xsha256 569 &movaps ("xmm0",&QWP(0,"esp")); 570 &movaps ("xmm1",&QWP(16,"esp")); 571 &mov ("esp","edx"); # restore %esp 572 &mov ("edi",&wparam(0)); 573 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context 574 &movups (&QWP(16,"edi"),"xmm1"); 575 &pop ("esi"); 576 &pop ("edi"); 577 &ret (); 578&function_end_B("padlock_sha256_blocks"); 579 580&function_begin_B("padlock_sha512_blocks"); 581 &push ("edi"); 582 &push ("esi"); 583 &mov ("edi",&wparam(0)); 584 &mov ("esi",&wparam(1)); 585 &mov ("ecx",&wparam(2)); 586 &mov ("edx","esp"); # put aside %esp 587 &add ("esp",-128); 588 &movups ("xmm0",&QWP(0,"edi")); # copy-in context 589 &and ("esp",-16); 590 &movups ("xmm1",&QWP(16,"edi")); 591 &movups ("xmm2",&QWP(32,"edi")); 592 &movups ("xmm3",&QWP(48,"edi")); 593 &movaps (&QWP(0,"esp"),"xmm0"); 594 &mov ("edi","esp"); 595 &movaps (&QWP(16,"esp"),"xmm1"); 596 &movaps (&QWP(32,"esp"),"xmm2"); 597 &movaps (&QWP(48,"esp"),"xmm3"); 598 &data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512 599 &movaps ("xmm0",&QWP(0,"esp")); 600 &movaps ("xmm1",&QWP(16,"esp")); 601 &movaps ("xmm2",&QWP(32,"esp")); 602 &movaps ("xmm3",&QWP(48,"esp")); 603 &mov ("esp","edx"); # restore %esp 604 &mov ("edi",&wparam(0)); 605 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context 606 &movups (&QWP(16,"edi"),"xmm1"); 607 &movups (&QWP(32,"edi"),"xmm2"); 608 &movups (&QWP(48,"edi"),"xmm3"); 609 &pop ("esi"); 610 &pop ("edi"); 611 &ret (); 612&function_end_B("padlock_sha512_blocks"); 613 614&asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>"); 615&align (16); 616 617&dataseg(); 618# Essentially this variable belongs in thread local storage. 619# Having this variable global on the other hand can only cause 620# few bogus key reloads [if any at all on signle-CPU system], 621# so we accept the penalty... 622&set_label("padlock_saved_context",4); 623&data_word(0); 624 625&asm_finish(); 626 627close STDOUT; 628