#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#               1xIALU/gcc      4xSSSE3
# Pentium       17.5/+80%
# PIII          14.2/+60%
# P4            18.6/+84%
# Core2         9.56/+89%       4.83
# Westmere      9.50/+45%       3.35
# Sandy Bridge  10.5/+47%       3.20
# Haswell       8.15/+50%       2.83
# Skylake       7.53/+22%       2.75
# Silvermont    17.4/+36%       8.35
# Goldmont      13.4/+40%       4.36
# Sledgehammer  10.2/+54%
# Bulldozer     13.4/+50%       4.38(*)
#
# (*)   Bulldozer actually executes 4xXOP code path that delivers 3.55;
#
# Modified from upstream OpenSSL to remove the XOP code.

# Locate the directory this script lives in so that the perlasm support
# module can be found relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Redirect STDOUT
# to it and fail loudly if it cannot be created: an unchecked open would
# let the script carry on and exit successfully without ever producing the
# requested file.  When no output argument is given the generated code is
# left on the inherited STDOUT.
$output=pop;
if (defined($output) && $output ne "") {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

$xmm=$ymm=1;
$gasver=999;    # enable everything

# Register allocation for the integer-only code path.  Each of b, c and d
# has an alternate register (the "_" variant); QUARTERROUND swaps the two
# names at the end of every call.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

# QUARTERROUND($ai,$bi,$ci,$di,$i)
#
# Emits the integer instructions for one ChaCha quarter round on state
# words $ai,$bi,$ci,$di.  $i (0..7) is the position of this quarter round
# inside the double round and selects which neighbouring words are
# spilled to / pre-loaded from the stack while the arithmetic runs.
#
# State word N lives at 4*N(%esp); 128(%esp) holds the loop counter.
# The leading "a += b" is not emitted here: the caller issues it before
# the first quarter round, and every quarter round except the last one
# ($i==7) issues it on behalf of its successor.
#
# Returns nothing useful; it is called for its side effect of emitting
# code and of swapping the global register names $b/$b_, $c/$c_, $d/$d_.
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));    # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));    # previous

    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14

    # At the seams between the even (column) and odd (diagonal) rounds
    # the "next"/"previous" words are not simply one step along the row,
    # so the indices are re-derived with a per-operand offset.
    if ($i==0) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==3) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    } elsif ($i==4) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==7) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    }

    #&add   ($a,$b);                        # see elsewhere
    &xor    ($d,$a);
    &mov    (&DWP(4*$cp,"esp"),$c_)         if ($ai>0 && $ai<3);
    &rol    ($d,16);
    &mov    (&DWP(4*$bp,"esp"),$b_)         if ($i!=0);
    &add    ($c,$d);
    &mov    ($c_,&DWP(4*$cn,"esp"))         if ($ai>0 && $ai<3);
    &xor    ($b,$c);
    &mov    ($d_,&DWP(4*$dn,"esp"))         if ($di!=$dn);
    &rol    ($b,12);
    &mov    ($b_,&DWP(4*$bn,"esp"))         if ($i<7);
    &mov    ($b_,&DWP(128,"esp"))           if ($i==7);     # loop counter
    &add    ($a,$b);
    &xor    ($d,$a);
    &mov    (&DWP(4*$ai,"esp"),$a);
    &rol    ($d,8);
    &mov    ($a,&DWP(4*$an,"esp"));
    &add    ($c,$d);
    &mov    (&DWP(4*$di,"esp"),$d)          if ($di!=$dn);
    &mov    ($d_,$d)                        if ($di==$dn);
    &xor    ($b,$c);
    &add    ($a,$b_)                        if ($i<7);      # elsewhere
    &rol    ($b,7);

    # The values for the next quarter round were loaded into the
    # alternate registers above; swap the names so that the next call
    # operates on them.
    ($b,$b_)=($b_,$b);
    ($c,$c_)=($c_,$c);
    ($d,$d_)=($d_,$d);
}

&static_label("ssse3_shortcut");
&static_label("ssse3_data");
&static_label("pic_point");

# ChaCha20_ctr32: integer-only code path plus run-time dispatch.
#
# Arguments are read through wparam(): 0 = output pointer, 1 = input
# pointer, 2 = length in bytes, 3 = key, 4 = counter and nonce.
#
# Stack frame (33 dwords): 4*N(%esp) holds working state word N,
# 64+4*N(%esp) holds the matching key material (words 4..15; the four
# sigma constants are immediates) and 128(%esp) holds the round-loop
# counter.
&function_begin("ChaCha20_ctr32");
    &xor    ("eax","eax");
    &cmp    ("eax",&wparam(2));             # len==0?
    &je     (&label("no_data"));
if ($xmm) {
    # Take the address of pic_point into eax; it is used here to reach
    # OPENSSL_ia32cap_P and again in ChaCha20_ssse3 to reach ssse3_data.
    &call   (&label("pic_point"));
&set_label("pic_point");
    &blindpop("eax");
    &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
    &test   (&DWP(0,"ebp"),1<<24);          # test FXSR bit
    &jz     (&label("x86"));
    &test   (&DWP(4,"ebp"),1<<9);           # test SSSE3 bit
    &jz     (&label("x86"));
    &jmp    (&label("ssse3_shortcut"));
&set_label("x86");
}
    &mov    ("esi",&wparam(3));             # key
    &mov    ("edi",&wparam(4));             # counter and nonce

    &stack_push(33);

    &mov    ("eax",&DWP(4*0,"esi"));        # copy key
    &mov    ("ebx",&DWP(4*1,"esi"));
    &mov    ("ecx",&DWP(4*2,"esi"));
    &mov    ("edx",&DWP(4*3,"esi"));
    &mov    (&DWP(64+4*4,"esp"),"eax");
    &mov    (&DWP(64+4*5,"esp"),"ebx");
    &mov    (&DWP(64+4*6,"esp"),"ecx");
    &mov    (&DWP(64+4*7,"esp"),"edx");
    &mov    ("eax",&DWP(4*4,"esi"));
    &mov    ("ebx",&DWP(4*5,"esi"));
    &mov    ("ecx",&DWP(4*6,"esi"));
    &mov    ("edx",&DWP(4*7,"esi"));
    &mov    (&DWP(64+4*8,"esp"),"eax");
    &mov    (&DWP(64+4*9,"esp"),"ebx");
    &mov    (&DWP(64+4*10,"esp"),"ecx");
    &mov    (&DWP(64+4*11,"esp"),"edx");
    &mov    ("eax",&DWP(4*0,"edi"));        # copy counter and nonce
    &mov    ("ebx",&DWP(4*1,"edi"));
    &mov    ("ecx",&DWP(4*2,"edi"));
    &mov    ("edx",&DWP(4*3,"edi"));
    # The counter is stored pre-decremented because every pass through
    # "entry" increments it before use.
    &sub    ("eax",1);
    &mov    (&DWP(64+4*12,"esp"),"eax");
    &mov    (&DWP(64+4*13,"esp"),"ebx");
    &mov    (&DWP(64+4*14,"esp"),"ecx");
    &mov    (&DWP(64+4*15,"esp"),"edx");
    &jmp    (&label("entry"));

&set_label("outer_loop",16);
    &mov    (&wparam(1),$b);                # save input
    &mov    (&wparam(0),$a);                # save output
    &mov    (&wparam(2),$c);                # save len
&set_label("entry");
    &mov    ($a,0x61707865);
    &mov    (&DWP(4*1,"esp"),0x3320646e);
    &mov    (&DWP(4*2,"esp"),0x79622d32);
    &mov    (&DWP(4*3,"esp"),0x6b206574);

    &mov    ($b, &DWP(64+4*5,"esp"));       # copy key material
    &mov    ($b_,&DWP(64+4*6,"esp"));
    &mov    ($c, &DWP(64+4*10,"esp"));
    &mov    ($c_,&DWP(64+4*11,"esp"));
    &mov    ($d, &DWP(64+4*13,"esp"));
    &mov    ($d_,&DWP(64+4*14,"esp"));
    &mov    (&DWP(4*5,"esp"),$b);
    &mov    (&DWP(4*6,"esp"),$b_);
    &mov    (&DWP(4*10,"esp"),$c);
    &mov    (&DWP(4*11,"esp"),$c_);
    &mov    (&DWP(4*13,"esp"),$d);
    &mov    (&DWP(4*14,"esp"),$d_);

    &mov    ($b, &DWP(64+4*7,"esp"));
    &mov    ($d_,&DWP(64+4*15,"esp"));
    &mov    ($d, &DWP(64+4*12,"esp"));
    &mov    ($b_,&DWP(64+4*4,"esp"));
    &mov    ($c, &DWP(64+4*8,"esp"));
    &mov    ($c_,&DWP(64+4*9,"esp"));
    &add    ($d,1);                         # counter value
    &mov    (&DWP(4*7,"esp"),$b);
    &mov    (&DWP(4*15,"esp"),$d_);
    &mov    (&DWP(64+4*12,"esp"),$d);       # save counter value

    &mov    ($b,10);                        # loop counter
    &jmp    (&label("loop"));

    # One iteration is a double round: four column quarter rounds
    # followed by four diagonal ones.  QUARTERROUND swaps the register
    # names on every call; eight calls leave them as they started.
&set_label("loop",16);
    &add    ($a,$b_);                       # elsewhere
    &mov    (&DWP(128,"esp"),$b);           # save loop counter
    &mov    ($b,$b_);
    &QUARTERROUND(0, 4, 8, 12, 0);
    &QUARTERROUND(1, 5, 9, 13, 1);
    &QUARTERROUND(2, 6,10, 14, 2);
    &QUARTERROUND(3, 7,11, 15, 3);
    &QUARTERROUND(0, 5,10, 15, 4);
    &QUARTERROUND(1, 6,11, 12, 5);
    &QUARTERROUND(2, 7, 8, 13, 6);
    &QUARTERROUND(3, 4, 9, 14, 7);
    &dec    ($b);
    &jnz    (&label("loop"));

    &mov    ($b,&wparam(2));                # load len

    &add    ($a,0x61707865);                # accumulate key material
    &add    ($b_,&DWP(64+4*4,"esp"));
    &add    ($c, &DWP(64+4*8,"esp"));
    &add    ($c_,&DWP(64+4*9,"esp"));

    &cmp    ($b,64);
    &jb     (&label("tail"));

    # Full 64-byte block: XOR the key stream with the input and write it
    # out, handling the words still held in registers first and then the
    # remaining words from the stack in two groups of five.
    &mov    ($b,&wparam(1));                # load input pointer
    &add    ($d, &DWP(64+4*12,"esp"));
    &add    ($d_,&DWP(64+4*14,"esp"));

    &xor    ($a, &DWP(4*0,$b));             # xor with input
    &xor    ($b_,&DWP(4*4,$b));
    &mov    (&DWP(4*0,"esp"),$a);
    &mov    ($a,&wparam(0));                # load output pointer
    &xor    ($c, &DWP(4*8,$b));
    &xor    ($c_,&DWP(4*9,$b));
    &xor    ($d, &DWP(4*12,$b));
    &xor    ($d_,&DWP(4*14,$b));
    &mov    (&DWP(4*4,$a),$b_);             # write output
    &mov    (&DWP(4*8,$a),$c);
    &mov    (&DWP(4*9,$a),$c_);
    &mov    (&DWP(4*12,$a),$d);
    &mov    (&DWP(4*14,$a),$d_);

    &mov    ($b_,&DWP(4*1,"esp"));
    &mov    ($c, &DWP(4*2,"esp"));
    &mov    ($c_,&DWP(4*3,"esp"));
    &mov    ($d, &DWP(4*5,"esp"));
    &mov    ($d_,&DWP(4*6,"esp"));
    &add    ($b_,0x3320646e);               # accumulate key material
    &add    ($c, 0x79622d32);
    &add    ($c_,0x6b206574);
    &add    ($d, &DWP(64+4*5,"esp"));
    &add    ($d_,&DWP(64+4*6,"esp"));
    &xor    ($b_,&DWP(4*1,$b));
    &xor    ($c, &DWP(4*2,$b));
    &xor    ($c_,&DWP(4*3,$b));
    &xor    ($d, &DWP(4*5,$b));
    &xor    ($d_,&DWP(4*6,$b));
    &mov    (&DWP(4*1,$a),$b_);
    &mov    (&DWP(4*2,$a),$c);
    &mov    (&DWP(4*3,$a),$c_);
    &mov    (&DWP(4*5,$a),$d);
    &mov    (&DWP(4*6,$a),$d_);

    &mov    ($b_,&DWP(4*7,"esp"));
    &mov    ($c, &DWP(4*10,"esp"));
    &mov    ($c_,&DWP(4*11,"esp"));
    &mov    ($d, &DWP(4*13,"esp"));
    &mov    ($d_,&DWP(4*15,"esp"));
    &add    ($b_,&DWP(64+4*7,"esp"));
    &add    ($c, &DWP(64+4*10,"esp"));
    &add    ($c_,&DWP(64+4*11,"esp"));
    &add    ($d, &DWP(64+4*13,"esp"));
    &add    ($d_,&DWP(64+4*15,"esp"));
    &xor    ($b_,&DWP(4*7,$b));
    &xor    ($c, &DWP(4*10,$b));
    &xor    ($c_,&DWP(4*11,$b));
    &xor    ($d, &DWP(4*13,$b));
    &xor    ($d_,&DWP(4*15,$b));
    &lea    ($b,&DWP(4*16,$b));
    &mov    (&DWP(4*7,$a),$b_);
    &mov    ($b_,&DWP(4*0,"esp"));
    &mov    (&DWP(4*10,$a),$c);
    &mov    ($c,&wparam(2));                # len
    &mov    (&DWP(4*11,$a),$c_);
    &mov    (&DWP(4*13,$a),$d);
    &mov    (&DWP(4*15,$a),$d_);
    &mov    (&DWP(4*0,$a),$b_);
    &lea    ($a,&DWP(4*16,$a));
    &sub    ($c,64);
    &jnz    (&label("outer_loop"));

    &jmp    (&label("done"));

    # Fewer than 64 bytes left: finish the key-stream block on the stack
    # and XOR it into the output one byte at a time.
&set_label("tail");
    &add    ($d, &DWP(64+4*12,"esp"));
    &add    ($d_,&DWP(64+4*14,"esp"));
    &mov    (&DWP(4*0,"esp"),$a);
    &mov    (&DWP(4*4,"esp"),$b_);
    &mov    (&DWP(4*8,"esp"),$c);
    &mov    (&DWP(4*9,"esp"),$c_);
    &mov    (&DWP(4*12,"esp"),$d);
    &mov    (&DWP(4*14,"esp"),$d_);

    &mov    ($b_,&DWP(4*1,"esp"));
    &mov    ($c, &DWP(4*2,"esp"));
    &mov    ($c_,&DWP(4*3,"esp"));
    &mov    ($d, &DWP(4*5,"esp"));
    &mov    ($d_,&DWP(4*6,"esp"));
    &add    ($b_,0x3320646e);               # accumulate key material
    &add    ($c, 0x79622d32);
    &add    ($c_,0x6b206574);
    &add    ($d, &DWP(64+4*5,"esp"));
    &add    ($d_,&DWP(64+4*6,"esp"));
    &mov    (&DWP(4*1,"esp"),$b_);
    &mov    (&DWP(4*2,"esp"),$c);
    &mov    (&DWP(4*3,"esp"),$c_);
    &mov    (&DWP(4*5,"esp"),$d);
    &mov    (&DWP(4*6,"esp"),$d_);

    &mov    ($b_,&DWP(4*7,"esp"));
    &mov    ($c, &DWP(4*10,"esp"));
    &mov    ($c_,&DWP(4*11,"esp"));
    &mov    ($d, &DWP(4*13,"esp"));
    &mov    ($d_,&DWP(4*15,"esp"));
    &add    ($b_,&DWP(64+4*7,"esp"));
    &add    ($c, &DWP(64+4*10,"esp"));
    &add    ($c_,&DWP(64+4*11,"esp"));
    &add    ($d, &DWP(64+4*13,"esp"));
    &add    ($d_,&DWP(64+4*15,"esp"));
    &mov    (&DWP(4*7,"esp"),$b_);
    &mov    ($b_,&wparam(1));               # load input
    &mov    (&DWP(4*10,"esp"),$c);
    &mov    ($c,&wparam(0));                # load output
    &mov    (&DWP(4*11,"esp"),$c_);
    &xor    ($c_,$c_);
    &mov    (&DWP(4*13,"esp"),$d);
    &mov    (&DWP(4*15,"esp"),$d_);

    &xor    ("eax","eax");
    &xor    ("edx","edx");
&set_label("tail_loop");
    &movb   ("al",&BP(0,$c_,$b_));
    &movb   ("dl",&BP(0,"esp",$c_));
    &lea    ($c_,&DWP(1,$c_));
    &xor    ("al","dl");
    &mov    (&BP(-1,$c,$c_),"al");
    &dec    ($b);
    &jnz    (&label("tail_loop"));

&set_label("done");
    &stack_pop(33);
&set_label("no_data");
&function_end("ChaCha20_ctr32");

if ($xmm) {
# Register naming for the 4-blocks-at-a-time SSSE3 path.  As in the
# integer code every operand has an alternate ("_") register whose name
# is swapped in at the end of each quarter round.
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# QUARTERROUND_SSSE3($ai,$bi,$ci,$di,$i)
#
# SIMD counterpart of QUARTERROUND: emits one quarter round over
# 16-byte vectors.  State row N is kept at 16*N-128(%ebx).  The rotates
# by 16 and 8 are byte shuffles using the masks at 0(%eax) and 16(%eax);
# the rotates by 12 and 7 are built from a shift pair and an OR, borrowing
# the spare "a" register as scratch.  The leading "a += b" and "d ^= a"
# are emitted by the caller for the first quarter round and by the
# preceding quarter round for the others.
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));    # next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));    # previous

    #       a   b   c   d
    #
    #       0   4   8  12 < even round
    #       1   5   9  13
    #       2   6  10  14
    #       3   7  11  15
    #       0   5  10  15 < odd round
    #       1   6  11  12
    #       2   7   8  13
    #       3   4   9  14

    if ($i==0) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==3) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
    } elsif ($i==4) {
        my $j=4;
        ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
    } elsif ($i==7) {
        my $j=0;
        ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
    }

    #&paddd ($xa,$xb);                      # see elsewhere
    #&pxor  ($xd,$xa);                      # see elsewhere
    &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)    if ($ai>0 && $ai<3);
    &pshufb ($xd,&QWP(0,"eax"));            # rot16
    &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)    if ($i!=0);
    &paddd  ($xc,$xd);
    &movdqa($xc_,&QWP(16*$cn-128,"ebx"))    if ($ai>0 && $ai<3);
    &pxor   ($xb,$xc);
    &movdqa($xb_,&QWP(16*$bn-128,"ebx"))    if ($i<7);
    &movdqa ($xa_,$xb);                     # borrow as temporary
    &pslld  ($xb,12);
    &psrld  ($xa_,20);
    &por    ($xb,$xa_);
    &movdqa($xa_,&QWP(16*$an-128,"ebx"));
    &paddd  ($xa,$xb);
    &movdqa($xd_,&QWP(16*$dn-128,"ebx"))    if ($di!=$dn);
    &pxor   ($xd,$xa);
    &movdqa (&QWP(16*$ai-128,"ebx"),$xa);
    &pshufb ($xd,&QWP(16,"eax"));           # rot8
    &paddd  ($xc,$xd);
    &movdqa (&QWP(16*$di-128,"ebx"),$xd)    if ($di!=$dn);
    &movdqa ($xd_,$xd)                      if ($di==$dn);
    &pxor   ($xb,$xc);
    &paddd  ($xa_,$xb_)                     if ($i<7);      # elsewhere
    &movdqa ($xa,$xb);                      # borrow as temporary
    &pslld  ($xb,7);
    &psrld  ($xa,25);
    &pxor   ($xd_,$xa_)                     if ($i<7);      # elsewhere
    &por    ($xb,$xa);

    ($xa,$xa_)=($xa_,$xa);
    ($xb,$xb_)=($xb_,$xb);
    ($xc,$xc_)=($xc_,$xc);
    ($xd,$xd_)=($xd_,$xd);
}

# ChaCha20_ssse3: SSSE3 code path, entered through ssse3_shortcut from
# ChaCha20_ctr32.  eax is expected to still hold the address of
# pic_point taken there; it is turned into the address of ssse3_data.
#
# The original stack pointer is kept at 512(%esp) after esp has been
# lowered and aligned down to a multiple of 64, and is restored from
# there at "done".
&function_begin("ChaCha20_ssse3");
&set_label("ssse3_shortcut");
    &mov    ($out,&wparam(0));
    &mov    ($inp,&wparam(1));
    &mov    ($len,&wparam(2));
    &mov    ("edx",&wparam(3));             # key
    &mov    ("ebx",&wparam(4));             # counter and nonce

    &mov    ("ebp","esp");
    &stack_push (131);
    &and    ("esp",-64);
    &mov    (&DWP(512,"esp"),"ebp");

    &lea    ("eax",&DWP(&label("ssse3_data")."-".
                        &label("pic_point"),"eax"));
    &movdqu ("xmm3",&QWP(0,"ebx"));         # counter and nonce

if (defined($gasver) && $gasver>=2.17) {    # even though we encode
                                            # pshufb manually, we
                                            # handle only register
                                            # operands, while this
                                            # segment uses memory
                                            # operand...
    # Inputs shorter than four blocks go straight to the one-block code.
    &cmp    ($len,64*4);
    &jb     (&label("1x"));

    &mov    (&DWP(512+4,"esp"),"edx");      # offload pointers
    &mov    (&DWP(512+8,"esp"),"ebx");
    &sub    ($len,64*4);                    # bias len
    &lea    ("ebp",&DWP(256+128,"esp"));    # size optimization

    # Broadcast every word of the counter/nonce, key and sigma across a
    # vector and park the sixteen vectors at 16*N-128(%ebp).  The counter
    # lanes get +0,+1,+2,+3 and are then moved back by four, because the
    # outer loop adds four before every pass.
    &movdqu ("xmm7",&QWP(0,"edx"));         # key
    &pshufd ("xmm0","xmm3",0x00);
    &pshufd ("xmm1","xmm3",0x55);
    &pshufd ("xmm2","xmm3",0xaa);
    &pshufd ("xmm3","xmm3",0xff);
    &paddd  ("xmm0",&QWP(16*3,"eax"));      # fix counters
    &pshufd ("xmm4","xmm7",0x00);
    &pshufd ("xmm5","xmm7",0x55);
    &psubd  ("xmm0",&QWP(16*4,"eax"));
    &pshufd ("xmm6","xmm7",0xaa);
    &pshufd ("xmm7","xmm7",0xff);
    &movdqa (&QWP(16*12-128,"ebp"),"xmm0");
    &movdqa (&QWP(16*13-128,"ebp"),"xmm1");
    &movdqa (&QWP(16*14-128,"ebp"),"xmm2");
    &movdqa (&QWP(16*15-128,"ebp"),"xmm3");
    &movdqu ("xmm3",&QWP(16,"edx"));        # key
    &movdqa (&QWP(16*4-128,"ebp"),"xmm4");
    &movdqa (&QWP(16*5-128,"ebp"),"xmm5");
    &movdqa (&QWP(16*6-128,"ebp"),"xmm6");
    &movdqa (&QWP(16*7-128,"ebp"),"xmm7");
    &movdqa ("xmm7",&QWP(16*2,"eax"));      # sigma
    &lea    ("ebx",&DWP(128,"esp"));        # size optimization

    &pshufd ("xmm0","xmm3",0x00);
    &pshufd ("xmm1","xmm3",0x55);
    &pshufd ("xmm2","xmm3",0xaa);
    &pshufd ("xmm3","xmm3",0xff);
    &pshufd ("xmm4","xmm7",0x00);
    &pshufd ("xmm5","xmm7",0x55);
    &pshufd ("xmm6","xmm7",0xaa);
    &pshufd ("xmm7","xmm7",0xff);
    &movdqa (&QWP(16*8-128,"ebp"),"xmm0");
    &movdqa (&QWP(16*9-128,"ebp"),"xmm1");
    &movdqa (&QWP(16*10-128,"ebp"),"xmm2");
    &movdqa (&QWP(16*11-128,"ebp"),"xmm3");
    &movdqa (&QWP(16*0-128,"ebp"),"xmm4");
    &movdqa (&QWP(16*1-128,"ebp"),"xmm5");
    &movdqa (&QWP(16*2-128,"ebp"),"xmm6");
    &movdqa (&QWP(16*3-128,"ebp"),"xmm7");

    &lea    ($inp,&DWP(128,$inp));          # size optimization
    &lea    ($out,&DWP(128,$out));          # size optimization
    &jmp    (&label("outer_loop"));

    # Each pass produces 256 bytes.  The key material at (%ebp) is copied
    # into the working state at (%ebx); rows 0, 4, 8, 9 and 12 are loaded
    # directly into registers further down instead of being copied.
&set_label("outer_loop",16);
    #&movdqa    ("xmm0",&QWP(16*0-128,"ebp"));  # copy key material
    &movdqa ("xmm1",&QWP(16*1-128,"ebp"));
    &movdqa ("xmm2",&QWP(16*2-128,"ebp"));
    &movdqa ("xmm3",&QWP(16*3-128,"ebp"));
    #&movdqa    ("xmm4",&QWP(16*4-128,"ebp"));
    &movdqa ("xmm5",&QWP(16*5-128,"ebp"));
    &movdqa ("xmm6",&QWP(16*6-128,"ebp"));
    &movdqa ("xmm7",&QWP(16*7-128,"ebp"));
    #&movdqa    (&QWP(16*0-128,"ebx"),"xmm0");
    &movdqa (&QWP(16*1-128,"ebx"),"xmm1");
    &movdqa (&QWP(16*2-128,"ebx"),"xmm2");
    &movdqa (&QWP(16*3-128,"ebx"),"xmm3");
    #&movdqa    (&QWP(16*4-128,"ebx"),"xmm4");
    &movdqa (&QWP(16*5-128,"ebx"),"xmm5");
    &movdqa (&QWP(16*6-128,"ebx"),"xmm6");
    &movdqa (&QWP(16*7-128,"ebx"),"xmm7");
    #&movdqa    ("xmm0",&QWP(16*8-128,"ebp"));
    #&movdqa    ("xmm1",&QWP(16*9-128,"ebp"));
    &movdqa ("xmm2",&QWP(16*10-128,"ebp"));
    &movdqa ("xmm3",&QWP(16*11-128,"ebp"));
    &movdqa ("xmm4",&QWP(16*12-128,"ebp"));
    &movdqa ("xmm5",&QWP(16*13-128,"ebp"));
    &movdqa ("xmm6",&QWP(16*14-128,"ebp"));
    &movdqa ("xmm7",&QWP(16*15-128,"ebp"));
    &paddd  ("xmm4",&QWP(16*4,"eax"));      # counter value
    #&movdqa    (&QWP(16*8-128,"ebx"),"xmm0");
    #&movdqa    (&QWP(16*9-128,"ebx"),"xmm1");
    &movdqa (&QWP(16*10-128,"ebx"),"xmm2");
    &movdqa (&QWP(16*11-128,"ebx"),"xmm3");
    &movdqa (&QWP(16*12-128,"ebx"),"xmm4");
    &movdqa (&QWP(16*13-128,"ebx"),"xmm5");
    &movdqa (&QWP(16*14-128,"ebx"),"xmm6");
    &movdqa (&QWP(16*15-128,"ebx"),"xmm7");
    &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value

    &movdqa ($xa, &QWP(16*0-128,"ebp"));
    &movdqa ($xd, "xmm4");
    &movdqa ($xb_,&QWP(16*4-128,"ebp"));
    &movdqa ($xc, &QWP(16*8-128,"ebp"));
    &movdqa ($xc_,&QWP(16*9-128,"ebp"));

    &mov    ("edx",10);                     # loop counter
    &nop    ();

&set_label("loop",16);
    &paddd  ($xa,$xb_);                     # elsewhere
    &movdqa ($xb,$xb_);
    &pxor   ($xd,$xa);                      # elsewhere
    &QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
    &QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
    &QUARTERROUND_SSSE3(2, 6,10, 14, 2);
    &QUARTERROUND_SSSE3(3, 7,11, 15, 3);
    &QUARTERROUND_SSSE3(0, 5,10, 15, 4);
    &QUARTERROUND_SSSE3(1, 6,11, 12, 5);
    &QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
    &QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
    &dec    ("edx");
    &jnz    (&label("loop"));

    # Spill the rows that are still only in registers.
    &movdqa (&QWP(16*4-128,"ebx"),$xb_);
    &movdqa (&QWP(16*8-128,"ebx"),$xc);
    &movdqa (&QWP(16*9-128,"ebx"),$xc_);
    &movdqa (&QWP(16*12-128,"ebx"),$xd);
    &movdqa (&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

    #&movdqa    ($xa0,&QWP(16*0-128,"ebx"));    # it's there
    &movdqa ($xa1,&QWP(16*1-128,"ebx"));
    &movdqa ($xa2,&QWP(16*2-128,"ebx"));
    &movdqa ($xa3,&QWP(16*3-128,"ebx"));

    # Four state rows are handled per generated iteration: add the key
    # material, transpose the 4x4 dwords so that each vector holds 16
    # consecutive key-stream bytes of one block, then XOR with the input.
    for($i=0;$i<256;$i+=64) {
    &paddd      ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
    &paddd      ($xa1,&QWP($i+16*1-128,"ebp"));
    &paddd      ($xa2,&QWP($i+16*2-128,"ebp"));
    &paddd      ($xa3,&QWP($i+16*3-128,"ebp"));

    &movdqa     ($xt2,$xa0);                # "de-interlace" data
    &punpckldq  ($xa0,$xa1);
    &movdqa     ($xt3,$xa2);
    &punpckldq  ($xa2,$xa3);
    &punpckhdq  ($xt2,$xa1);
    &punpckhdq  ($xt3,$xa3);
    &movdqa     ($xa1,$xa0);
    &punpcklqdq ($xa0,$xa2);                # "a0"
    &movdqa     ($xa3,$xt2);
    &punpcklqdq ($xt2,$xt3);                # "a2"
    &punpckhqdq ($xa1,$xa2);                # "a1"
    &punpckhqdq ($xa3,$xt3);                # "a3"

    #($xa2,$xt2)=($xt2,$xa2);

    &movdqu     ($xt0,&QWP(64*0-128,$inp)); # load input
    &movdqu     ($xt1,&QWP(64*1-128,$inp));
    &movdqu     ($xa2,&QWP(64*2-128,$inp));
    &movdqu     ($xt3,&QWP(64*3-128,$inp));
    &lea        ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
    &pxor       ($xt0,$xa0);
    &movdqa     ($xa0,&QWP($i+16*4-128,"ebx"))  if ($i<192);
    &pxor       ($xt1,$xa1);
    &movdqa     ($xa1,&QWP($i+16*5-128,"ebx"))  if ($i<192);
    &pxor       ($xt2,$xa2);
    &movdqa     ($xa2,&QWP($i+16*6-128,"ebx"))  if ($i<192);
    &pxor       ($xt3,$xa3);
    &movdqa     ($xa3,&QWP($i+16*7-128,"ebx"))  if ($i<192);
    &movdqu     (&QWP(64*0-128,$out),$xt0); # store output
    &movdqu     (&QWP(64*1-128,$out),$xt1);
    &movdqu     (&QWP(64*2-128,$out),$xt2);
    &movdqu     (&QWP(64*3-128,$out),$xt3);
    &lea        ($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
    &sub    ($len,64*4);
    &jnc    (&label("outer_loop"));

    &add    ($len,64*4);
    &jz     (&label("done"));

    # Fewer than four blocks remain: undo the pointer bias and rebuild a
    # scalar counter/nonce vector in xmm3 for the one-block code below.
    &mov    ("ebx",&DWP(512+8,"esp"));      # restore pointers
    &lea    ($inp,&DWP(-128,$inp));
    &mov    ("edx",&DWP(512+4,"esp"));
    &lea    ($out,&DWP(-128,$out));

    &movd   ("xmm2",&DWP(16*12-128,"ebp")); # counter value
    &movdqu ("xmm3",&QWP(0,"ebx"));
    &paddd  ("xmm2",&QWP(16*6,"eax"));      # +four
    &pand   ("xmm3",&QWP(16*7,"eax"));
    &por    ("xmm3","xmm2");                # counter value
}
{
# One-block-at-a-time path: the four state rows live in $a..$d, $t and
# $t1 are scratch, $rot16/$rot24 hold the byte-shuffle masks.
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# SSSE3ROUND(): emits four quarter rounds in parallel, one per dword
# lane of $a,$b,$c,$d.  The caller rotates the lanes between calls to
# switch between column and diagonal rounds.
sub SSSE3ROUND {    # critical path is 20 "SIMD ticks" per round
    &paddd  ($a,$b);
    &pxor   ($d,$a);
    &pshufb ($d,$rot16);

    &paddd  ($c,$d);
    &pxor   ($b,$c);
    &movdqa ($t,$b);
    &psrld  ($b,20);
    &pslld  ($t,12);
    &por    ($b,$t);

    &paddd  ($a,$b);
    &pxor   ($d,$a);
    &pshufb ($d,$rot24);

    &paddd  ($c,$d);
    &pxor   ($b,$c);
    &movdqa ($t,$b);
    &psrld  ($b,25);
    &pslld  ($t,7);
    &por    ($b,$t);
}

    # The initial state is kept at 0..63(%esp) so that it can be added
    # back after the rounds and reloaded for the next block.
&set_label("1x");
    &movdqa ($a,&QWP(16*2,"eax"));          # sigma
    &movdqu ($b,&QWP(0,"edx"));
    &movdqu ($c,&QWP(16,"edx"));
    #&movdqu    ($d,&QWP(0,"ebx"));         # already loaded
    &movdqa ($rot16,&QWP(0,"eax"));
    &movdqa ($rot24,&QWP(16,"eax"));
    &mov    (&DWP(16*3,"esp"),"ebp");

    &movdqa (&QWP(16*0,"esp"),$a);
    &movdqa (&QWP(16*1,"esp"),$b);
    &movdqa (&QWP(16*2,"esp"),$c);
    &movdqa (&QWP(16*3,"esp"),$d);
    &mov    ("edx",10);
    &jmp    (&label("loop1x"));

&set_label("outer1x",16);
    &movdqa ($d,&QWP(16*5,"eax"));          # one
    &movdqa ($a,&QWP(16*0,"esp"));
    &movdqa ($b,&QWP(16*1,"esp"));
    &movdqa ($c,&QWP(16*2,"esp"));
    &paddd  ($d,&QWP(16*3,"esp"));
    &mov    ("edx",10);
    &movdqa (&QWP(16*3,"esp"),$d);
    &jmp    (&label("loop1x"));

&set_label("loop1x",16);
    &SSSE3ROUND();
    &pshufd ($c,$c,0b01001110);
    &pshufd ($b,$b,0b00111001);
    &pshufd ($d,$d,0b10010011);
    &nop    ();

    &SSSE3ROUND();
    &pshufd ($c,$c,0b01001110);
    &pshufd ($b,$b,0b10010011);
    &pshufd ($d,$d,0b00111001);

    &dec    ("edx");
    &jnz    (&label("loop1x"));

    &paddd  ($a,&QWP(16*0,"esp"));
    &paddd  ($b,&QWP(16*1,"esp"));
    &paddd  ($c,&QWP(16*2,"esp"));
    &paddd  ($d,&QWP(16*3,"esp"));

    &cmp    ($len,64);
    &jb     (&label("tail"));

    &movdqu ($t,&QWP(16*0,$inp));
    &movdqu ($t1,&QWP(16*1,$inp));
    &pxor   ($a,$t);                        # xor with input
    &movdqu ($t,&QWP(16*2,$inp));
    &pxor   ($b,$t1);
    &movdqu ($t1,&QWP(16*3,$inp));
    &pxor   ($c,$t);
    &pxor   ($d,$t1);
    &lea    ($inp,&DWP(16*4,$inp));         # inp+=64

    &movdqu (&QWP(16*0,$out),$a);           # write output
    &movdqu (&QWP(16*1,$out),$b);
    &movdqu (&QWP(16*2,$out),$c);
    &movdqu (&QWP(16*3,$out),$d);
    &lea    ($out,&DWP(16*4,$out));         # out+=64

    &sub    ($len,64);
    &jnz    (&label("outer1x"));

    &jmp    (&label("done"));

    # Partial final block: park the key stream on the stack and XOR it
    # into the output byte by byte.
&set_label("tail");
    &movdqa (&QWP(16*0,"esp"),$a);
    &movdqa (&QWP(16*1,"esp"),$b);
    &movdqa (&QWP(16*2,"esp"),$c);
    &movdqa (&QWP(16*3,"esp"),$d);

    &xor    ("eax","eax");
    &xor    ("edx","edx");
    &xor    ("ebp","ebp");

&set_label("tail_loop");
    &movb   ("al",&BP(0,"esp","ebp"));
    &movb   ("dl",&BP(0,$inp,"ebp"));
    &lea    ("ebp",&DWP(1,"ebp"));
    &xor    ("al","dl");
    &movb   (&BP(-1,$out,"ebp"),"al");
    &dec    ($len);
    &jnz    (&label("tail_loop"));
}
&set_label("done");
    &mov    ("esp",&DWP(512,"esp"));
&function_end("ChaCha20_ssse3");

# Constant table addressed through eax, 16 bytes per row:
#   16*0  byte-shuffle mask for the rotate by 16
#   16*1  byte-shuffle mask for the rotate by 8 (a.k.a. rot24)
#   16*2  sigma constants
#   16*3  per-lane counter offsets 0,1,2,3 for the 4x path
#   16*4  per-lane counter increment 4 for the 4x path
#   16*5  counter increment 1 for the 1x path
#   16*6  counter increment 4 used when falling from 4x to 1x
#   16*7  mask that clears the counter word and keeps the nonce
&align  (64);
&set_label("ssse3_data");
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
&data_word(0,1,2,3);
&data_word(4,4,4,4);
&data_word(1,0,0,0);
&data_word(4,0,0,0);
&data_word(0,-1,-1,-1);
&align  (64);
}
&asciz  ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors (full disk, I/O error) only surface when the
# handle is closed; treat them as fatal so that a truncated assembly file
# is never mistaken for a successful run.
close STDOUT or die "error closing STDOUT: $!";