#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. Compared to the original engine
# version, which relied on inline assembler and was compiled with gcc
# 3.4.6, this module was measured to provide ~100% improvement on
# misaligned data in ECB mode and ~75% in CBC mode. For aligned data the
# improvement can be observed for short inputs only, e.g. 45% for
# 64-byte messages in ECB mode, 20% in CBC. The difference in
# performance for aligned vs. misaligned data depends on the degree of
# misalignment and is either ~1.8x or ~2.9x. These are approximately the
# same factors as for the hardware alignment support, so there is little
# reason to rely on the latter. On the contrary, it might actually hurt
# performance with a mixture of aligned and misaligned buffers, because
# a) if you choose to flip the 'align' flag in the control word on a
# per-buffer basis, then you have to reload the key context, which
# incurs a penalty; b) if you choose to set the 'align' flag permanently,
# it limits performance even for aligned data to ~1/2. All of the above
# results were collected on a 1.5GHz C7. Nano, on the other hand, handles
# unaligned data more gracefully. Depending on the algorithm and the
# degree of misalignment, hardware can be up to 70% more efficient than
# the software alignment procedures below, nor does the 'align' flag have
# any effect on aligned performance [if it has any meaning at all]. The
# suggestion is therefore to unconditionally set the 'align' flag on Nano
# for optimal performance.
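# As a hedged illustration of that advice from the engine's C side: the
# offsets and bit positions come from the code below (the control word
# lives at offset 16 of the cipher-data structure and bit 5 of its first
# dword is the 'align' flag; bit 4 of padlock_capability()'s result is
# repurposed below to mark Nano). The variable names are illustrative
# only, not part of this module:
#
#	unsigned int *cword = (unsigned int *)((char *)cdata + 16);
#	if (padlock_capability() & (1 << 4))	/* Nano-class core */
#		cword[0] |= 1 << 5;		/* set 'align' permanently */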

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
require "x86asm.pl";

$output=pop and open STDOUT,">$output";

&asm_init($ARGV[0]);

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

$ctx="edx";
$out="edi";
$inp="esi";
$len="ecx";
$chunk="ebx";

&function_begin_B("padlock_capability");
	&push	("ebx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("noluck"));
	&cpuid	();
	&xor	("eax","eax");
	&cmp	("ebx","0x".unpack("H*",'tneC'));
	&jne	(&label("zhaoxin"));
	&cmp	("edx","0x".unpack("H*",'Hrua'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'slua'));
	&jne	(&label("noluck"));
	&jmp	(&label("zhaoxinEnd"));
&set_label("zhaoxin");
	&cmp	("ebx","0x".unpack("H*",'hS  '));
	&jne	(&label("noluck"));
	&cmp	("edx","0x".unpack("H*",'hgna'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'  ia'));
	&jne	(&label("noluck"));
&set_label("zhaoxinEnd");
	&mov	("eax",0xC0000000);
	&cpuid	();
	&mov	("edx","eax");
	&xor	("eax","eax");
	&cmp	("edx",0xC0000001);
	&jb	(&label("noluck"));
	&mov	("eax",1);
	&cpuid	();
	&or	("eax",0x0f);
	&xor	("ebx","ebx");
	&and	("eax",0x0fff);
	&cmp	("eax",0x06ff);		# check for Nano
	&sete	("bl");
	&mov	("eax",0xC0000001);
	&push	("ebx");
	&cpuid	();
	&pop	("ebx");
	&mov	("eax","edx");
	&shl	("ebx",4);		# bit#4 denotes Nano
	&and	("eax",0xffffffef);
	&or	("eax","ebx");
&set_label("noluck");
	&pop	("ebx");
	&ret	();
&function_end_B("padlock_capability");

&function_begin_B("padlock_key_bswap");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&DWP(240,"edx"));
&set_label("bswap_loop");
	&mov	("eax",&DWP(0,"edx"));
	&bswap	("eax");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("edx",&DWP(4,"edx"));
	&sub	("ecx",1);
	&jnz	(&label("bswap_loop"));
	&ret	();
&function_end_B("padlock_key_bswap");

# This is heuristic key context tracing. At first one might think that
# atomic swap instructions are required, but that is not actually
# necessary. The point is that if padlock_saved_context is changed by
# another thread after we've read it and before we compare it with ctx,
# our key *shall* be reloaded upon the thread context switch anyway, so
# we are covered in either case...
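# A hedged C-level sketch of the idea described above, simplified (the
# real code below additionally consults an EFLAGS bit before deciding);
# force_key_reload() is a hypothetical stand-in for the pushfl/popfl
# sequence that makes the next xcrypt instruction reload the key:
#
#	static void *padlock_saved_context;
#
#	static void padlock_verify_context(void *ctx)
#	{
#		if (ctx != padlock_saved_context)
#			force_key_reload();	/* pushfl/popfl below */
#		padlock_saved_context = ctx;
#	}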
&static_label("padlock_saved_context");

&function_begin_B("padlock_verify_context");
	&mov	($ctx,&wparam(0));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
			&DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
	&pushf	();
	&call	("_padlock_verify_ctx");
&set_label("verify_pic_point");
	&lea	("esp",&DWP(4,"esp"));
	&ret	();
&function_end_B("padlock_verify_context");

&function_begin_B("_padlock_verify_ctx");
	&add	("eax",&DWP(0,"esp")) if (!($::win32 or $::coff));# &padlock_saved_context
	&bt	(&DWP(4,"esp"),30);		# eflags
	&jnc	(&label("verified"));
	&cmp	($ctx,&DWP(0,"eax"));
	&je	(&label("verified"));
	&pushf	();
	&popf	();
&set_label("verified");
	&mov	(&DWP(0,"eax"),$ctx);
	&ret	();
&function_end_B("_padlock_verify_ctx");

&function_begin_B("padlock_reload_key");
	&pushf	();
	&popf	();
	&ret	();
&function_end_B("padlock_reload_key");

&function_begin_B("padlock_aes_block");
	&push	("edi");
	&push	("esi");
	&push	("ebx");
	&mov	($out,&wparam(0));	# must be 16-byte aligned
	&mov	($inp,&wparam(1));	# must be 16-byte aligned
	&mov	($ctx,&wparam(2));
	&mov	($len,1);
	&lea	("ebx",&DWP(32,$ctx));	# key
	&lea	($ctx,&DWP(16,$ctx));	# control word
	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
	&pop	("ebx");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_aes_block");
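# Usage sketch for the single-block helper above. The prototype is
# inferred from the &wparam() accesses and is therefore an assumption,
# as are the variable names; both buffers have to be 16-byte aligned,
# as noted in the code:
#
#	void padlock_aes_block(void *out, const void *inp,
#	                       struct padlock_cipher_data *ctx);
#
#	unsigned char blk[16] __attribute__((aligned(16)));
#	padlock_aes_block(blk, blk, cdata);	/* one in-place ECB block */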

sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
&function_begin("padlock_${mode}_encrypt");
	&mov	($out,&wparam(0));
	&mov	($inp,&wparam(1));
	&mov	($ctx,&wparam(2));
	&mov	($len,&wparam(3));
	&test	($ctx,15);
	&jnz	(&label("${mode}_abort"));
	&test	($len,15);
	&jnz	(&label("${mode}_abort"));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
			&DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
	&pushf	();
	&cld	();
	&call	("_padlock_verify_ctx");
&set_label("${mode}_pic_point");
	&lea	($ctx,&DWP(16,$ctx));	# control word
	&xor	("eax","eax");
    if ($mode eq "ctr32") {
	&movq	("mm0",&QWP(-16,$ctx));	# load [upper part of] counter
    } else {
	&xor	("ebx","ebx");
	&test	(&DWP(0,$ctx),1<<5);	# align bit in control word
	&jnz	(&label("${mode}_aligned"));
	&test	($out,0x0f);
	&setz	("al");			# !out_misaligned
	&test	($inp,0x0f);
	&setz	("bl");			# !inp_misaligned
	&test	("eax","ebx");
	&jnz	(&label("${mode}_aligned"));
	&neg	("eax");
    }
	&mov	($chunk,$PADLOCK_CHUNK);
	&not	("eax");		# out_misaligned?-1:0
	&lea	("ebp",&DWP(-24,"esp"));
	&cmp	($len,$chunk);
	&cmovc	($chunk,$len);		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	&and	("eax",$chunk);		# out_misaligned?chunk:0
	&mov	($chunk,$len);
	&neg	("eax");
	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
	&mov	("eax",$PADLOCK_CHUNK);
	&cmovz	($chunk,"eax");		# chunk=chunk?:PADLOCK_CHUNK
	&mov	("eax","ebp");
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
    if ($PADLOCK_PREFETCH{$mode}) {
	&cmp	($len,$chunk);
	&ja	(&label("${mode}_loop"));
	&mov	("eax",$inp);		# check if prefetch crosses page
	&cmp	("ebp","esp");
	&cmove	("eax",$out);
	&add	("eax",$len);
	&neg	("eax");
	&and	("eax",0xfff);		# distance to page boundary
	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
	&cmovae	("eax",$chunk);		# mask=distance<prefetch?-prefetch:-1
	&and	($chunk,"eax");
	&jz	(&label("${mode}_unaligned_tail"));
    }
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_loop",16);
	&mov	(&DWP(0,"ebp"),$out);	# save parameters
	&mov	(&DWP(4,"ebp"),$inp);
	&mov	(&DWP(8,"ebp"),$len);
	&mov	($len,$chunk);
	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
    if ($mode eq "ctr32") {
	&mov	("ecx",&DWP(-4,$ctx));
	&xor	($out,$out);
	&mov	("eax",&DWP(-8,$ctx));	# borrow $len
&set_label("${mode}_prepare");
	&mov	(&DWP(12,"esp",$out),"ecx");
	&bswap	("ecx");
	&movq	(&QWP(0,"esp",$out),"mm0");
	&inc	("ecx");
	&mov	(&DWP(8,"esp",$out),"eax");
	&bswap	("ecx");
	&lea	($out,&DWP(16,$out));
	&cmp	($out,$chunk);
	&jb	(&label("${mode}_prepare"));

	&mov	(&DWP(-4,$ctx),"ecx");
	&lea	($inp,&DWP(0,"esp"));
	&lea	($out,&DWP(0,"esp"));
	&mov	($len,$chunk);
    } else {
	&test	($out,0x0f);		# out_misaligned
	&cmovnz	($out,"esp");
	&test	($inp,0x0f);		# inp_misaligned
	&jz	(&label("${mode}_inp_aligned"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);		# rep movsl
	&sub	($out,$chunk);
	&mov	($len,$chunk);
	&mov	($inp,$out);
&set_label("${mode}_inp_aligned");
    }
	&lea	("eax",&DWP(-16,$ctx));	# ivp
	&lea	("ebx",&DWP(16,$ctx));	# key
	&shr	($len,4);		# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
    if ($mode !~ /ecb|ctr/) {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
    }
	&mov	($out,&DWP(0,"ebp"));	# restore parameters
	&mov	($chunk,&DWP(12,"ebp"));
    if ($mode eq "ctr32") {
	&mov	($inp,&DWP(4,"ebp"));
	&xor	($len,$len);
&set_label("${mode}_xor");
	&movups	("xmm1",&QWP(0,$inp,$len));
	&lea	($len,&DWP(16,$len));
	&pxor	("xmm1",&QWP(-16,"esp",$len));
	&movups	(&QWP(-16,$out,$len),"xmm1");
	&cmp	($len,$chunk);
	&jb	(&label("${mode}_xor"));
    } else {
	&test	($out,0x0f);
	&jz	(&label("${mode}_out_aligned"));
	&mov	($len,$chunk);
	&lea	($inp,&DWP(0,"esp"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);		# rep movsl
	&sub	($out,$chunk);
&set_label("${mode}_out_aligned");
	&mov	($inp,&DWP(4,"ebp"));
    }
	&mov	($len,&DWP(8,"ebp"));
	&add	($out,$chunk);
	&add	($inp,$chunk);
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
    if (!$PADLOCK_PREFETCH{$mode}) {
	&jnz	(&label("${mode}_loop"));
    } else {
	&jz	(&label("${mode}_break"));
	&cmp	($len,$chunk);
	&jae	(&label("${mode}_loop"));

&set_label("${mode}_unaligned_tail");
	&xor	("eax","eax");
	&cmp	("esp","ebp");
	&cmove	("eax",$len);
	&sub	("esp","eax");		# alloca
	&mov	("eax",$out);		# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);		# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");		# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_break",16);
    }
    if ($mode ne "ctr32") {
	&cmp	("esp","ebp");
	&je	(&label("${mode}_done"));
    }
	&pxor	("xmm0","xmm0");
	&lea	("eax",&DWP(0,"esp"));
&set_label("${mode}_bzero");
	&movaps	(&QWP(0,"eax"),"xmm0");
	&lea	("eax",&DWP(16,"eax"));
	&cmp	("ebp","eax");
	&ja	(&label("${mode}_bzero"));

&set_label("${mode}_done");
	&mov	("ebp",&DWP(16,"ebp"));
	&lea	("esp",&DWP(24,"ebp"));
    if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
    if ($PADLOCK_PREFETCH{$mode}) {
	&lea	("ebp",&DWP(0,$inp,$len));
	&neg	("ebp");
	&and	("ebp",0xfff);		# distance to page boundary
	&xor	("eax","eax");
	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
	&cmovae	("ebp","eax");
	&and	("ebp",$len);		# remainder
	&sub	($len,"ebp");
	&jz	(&label("${mode}_aligned_tail"));
    }
	&lea	("eax",&DWP(-16,$ctx));	# ivp
	&lea	("ebx",&DWP(16,$ctx));	# key
	&shr	($len,4);		# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
    if ($mode ne "ecb") {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
    }
    if ($PADLOCK_PREFETCH{$mode}) {
	&test	("ebp","ebp");
	&jz	(&label("${mode}_exit"));

&set_label("${mode}_aligned_tail");
	&mov	($len,"ebp");
	&lea	("ebp",&DWP(-24,"esp"));
	&mov	("esp","ebp");
	&mov	("eax","ebp");
	&sub	("esp",$len);
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	&mov	("eax",$out);		# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);		# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");		# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));
    }
&set_label("${mode}_exit");			}
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));	# popf
	&emms	() if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xc8);	# yes, it implements its own CTR with the ECB
				# opcode, because hardware CTR was introduced
				# later and even has errata on certain C7
				# steppings; the software implementation
				# *always* works, though it is ~15% slower
				# than the dedicated hardware...
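# A hedged sketch of the CTR-with-ECB idea used by the ctr32 path above
# (the counter layout follows the loads from &DWP(-16/-8/-4,$ctx); the
# scratch buffer and the store_be32() helper are illustrative only, and
# the real code inlines "rep xcryptecb" rather than calling the ECB
# routine):
#
#	for (i = 0; i < blocks; i++) {
#		memcpy(scratch + 16*i, iv, 12);		/* upper 96 counter bits */
#		store_be32(scratch + 16*i + 12, ctr++);	/* 32-bit big-endian counter */
#	}
#	padlock_ecb_encrypt(scratch, scratch, ctx, 16*blocks);
#	for (i = 0; i < 16*blocks; i++)
#		out[i] = inp[i] ^ scratch[i];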

&function_begin_B("padlock_xstore");
	&push	("edi");
	&mov	("edi",&wparam(0));
	&mov	("edx",&wparam(1));
	&data_byte(0x0f,0xa7,0xc0);		# xstore
	&pop	("edi");
	&ret	();
&function_end_B("padlock_xstore");

&function_begin_B("_win32_segv_handler");
	&mov	("eax",1);			# ExceptionContinueSearch
	&mov	("edx",&wparam(0));		# *ExceptionRecord
	&mov	("ecx",&wparam(2));		# *ContextRecord
	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
	&jne	(&label("ret"));
	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
	&mov	("eax",0);			# ExceptionContinueExecution
&set_label("ret");
	&ret	();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")			if ($::win32);
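# A C-level sketch of the handler above, assuming <windows.h> for the
# SEH types; the field offsets follow the comments in the code
# (CONTEXT.Eip lives at offset 184 on x86). This is an illustration,
# not the code that is actually assembled:
#
#	EXCEPTION_DISPOSITION _win32_segv_handler(EXCEPTION_RECORD *rec,
#						  void *frame, CONTEXT *ctx,
#						  void *dispatch)
#	{
#		if (rec->ExceptionCode != STATUS_ACCESS_VIOLATION)
#			return ExceptionContinueSearch;
#		ctx->Eip += 4;		/* skip over the 4-byte "rep xsha*" */
#		return ExceptionContinueExecution;
#	}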

&function_begin_B("padlock_sha1_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push %fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov %esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);			# 32 is enough but spec says 128
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop %fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_oneshot");

&function_begin_B("padlock_sha1_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("edx","esp");			# put aside %esp
	&mov	("ecx",&wparam(2));
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_blocks");

&function_begin_B("padlock_sha256_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push %fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov %esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop %fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_oneshot");

&function_begin_B("padlock_sha256_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_blocks");

&function_begin_B("padlock_sha512_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movups	("xmm2",&QWP(32,"edi"));
	&movups	("xmm3",&QWP(48,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&movaps	(&QWP(32,"esp"),"xmm2");
	&movaps	(&QWP(48,"esp"),"xmm3");
	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&movaps	("xmm2",&QWP(32,"esp"));
	&movaps	("xmm3",&QWP(48,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&movups	(&QWP(32,"edi"),"xmm2");
	&movups	(&QWP(48,"edi"),"xmm3");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha512_blocks");

&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align	(16);

&dataseg();
# Essentially this variable belongs in thread local storage. Having it
# global instead can at worst cause a few bogus key reloads [if any at
# all on a single-CPU system], so we accept the penalty...
&set_label("padlock_saved_context",4);
&data_word(0);

&asm_finish();

close STDOUT;