#!/usr/bin/env perl

######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumptions
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl              vpaes-x86.pl
#
# Core 2(**)    29.1/42.3/18.3          22.0/25.6(***)
# Nehalem       27.9/40.4/18.1          10.3/12.0
# Atom          102./119./60.1          64.5/85.3(***)
#
# (*)   "Hyper-threading" in the context refers rather to cache shared
#       among multiple cores, than to specifically Intel HTT. As vast
#       majority of contemporary cores share cache, slower code path
#       is commonplace. In other words "with-hyper-threading-off"
#       results are presented mostly for reference purposes.
#
# (**)  "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
#       pshufb, yet it's respectable +32%/65% improvement on Core 2
#       and +58%/40% on Atom (as implied, over "hyper-threading-safe"
#       code path).
#
#                                               <appro@openssl.org>

# Directory part of the script path (with trailing separator), used to
# locate the perlasm support modules.  The capture is only read when the
# match succeeded; a bare script name yields the empty string.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used throughout.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  Constant tables.
##
##  Every $k_* value below is the byte offset of a table relative to
##  _vpaes_consts+0x30: the entry points in this file load $const with
##  "_vpaes_consts+0x30-pic_point", which is why the first two tables
##  have negative offsets and $k_ipt is 0.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;           # inv, inva
    &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
    &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;           # s0F
    &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;            # input transform (lo, hi)
    &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
    &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;            # sb1u, sb1t
    &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
    &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;            # sb2u, sb2t
    &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
    &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;            # sbou, sbot
    &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
    &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;     # mc_forward
    &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
    &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
    &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
    &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;    # mc_backward
    &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
    &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
    &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
    &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;            # sr
    &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
    &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
    &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
    &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;          # rcon
    &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;           # s63: all equal to 0x63 transformed
    &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;           # output transform
    &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
    &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;        # deskew tables: inverts the sbox's "skew"
    &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
    &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
##  Decryption stuff
##  Key schedule constants
##
$k_dksd=0x1a0;          # decryption key schedule: invskew x*D
    &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
    &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;          # decryption key schedule: invskew x*B
    &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
    &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;          # decryption key schedule: invskew x*E + 0x63
    &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
    &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;          # decryption key schedule: invskew x*9
    &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
    &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Decryption stuff
##  Round function constants
##
$k_dipt=0x220;          # decryption input transform
    &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
    &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;          # decryption sbox output *9*u, *9*t
    &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
    &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;          # decryption sbox output *D*u, *D*t
    &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
    &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;          # decryption sbox output *B*u, *B*t
    &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
    &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;          # decryption sbox output *E*u, *E*t
    &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
    &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;          # decryption sbox final output
    &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
    &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz  ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align  (64);

##
##  _vpaes_preheat
##
##  Turns the position-independent offset in $const into an absolute
##  pointer by adding the return address found at (%esp), then loads
##  the constants the round functions keep in registers:
##     %xmm7 = inv table ($k_inv)
##     %xmm6 = s0F nibble mask ($k_s0F)
##
&function_begin_B("_vpaes_preheat");
    &add    ($const,&DWP(0,"esp"));
    &movdqa ("xmm7",&QWP($k_inv,$const));
    &movdqa ("xmm6",&QWP($k_s0F,$const));
    &ret    ();
&function_end_B("_vpaes_preheat");

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  The round count is read from 240(%edx).  $magic (%ecx) steps
##  through 0x10,0x20,0x30,0x00,... and selects the row of the
##  mc_forward / mc_backward / sr tables used by the current round;
##  $base (%ebx) points at mc_backward, so -0x40($base) is mc_forward
##  and 0x40($base) is sr.
##
&function_begin_B("_vpaes_encrypt_core");
    &mov    ($magic,16);
    &mov    ($round,&DWP(240,$key));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_ipt,$const));
    &pandn  ("xmm1","xmm0");
    &movdqu ("xmm5",&QWP(0,$key));
    &psrld  ("xmm1",4);
    &pand   ("xmm0","xmm6");
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_ipt+16,$const));
    &pshufb ("xmm0","xmm1");
    &pxor   ("xmm2","xmm5");
    &pxor   ("xmm0","xmm2");
    &add    ($key,16);
    &lea    ($base,&DWP($k_mc_backward,$const));
    &jmp    (&label("enc_entry"));


&set_label("enc_loop",16);
    # middle of middle round
    &movdqa ("xmm4",&QWP($k_sb1,$const));       # 4 : sb1u
    &pshufb ("xmm4","xmm2");                    # 4 = sb1u
    &pxor   ("xmm4","xmm5");                    # 4 = sb1u + k
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));    # 0 : sb1t
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &movdqa ("xmm5",&QWP($k_sb2,$const));       # 5 : sb2u
    &pshufb ("xmm5","xmm2");                    # 5 = sb2u
    &movdqa ("xmm1",&QWP(-0x40,$base,$magic));  # .Lk_mc_forward[]
    &movdqa ("xmm2",&QWP($k_sb2+16,$const));    # 2 : sb2t
    &pshufb ("xmm2","xmm3");                    # 2 = sb2t
    &pxor   ("xmm2","xmm5");                    # 2 = 2A
    &movdqa ("xmm4",&QWP(0,$base,$magic));      # .Lk_mc_backward[]
    &movdqa ("xmm3","xmm0");                    # 3 = A
    &pshufb ("xmm0","xmm1");                    # 0 = B
    &add    ($key,16);                          # next key
    &pxor   ("xmm0","xmm2");                    # 0 = 2A+B
    &pshufb ("xmm3","xmm4");                    # 3 = D
    &add    ($magic,16);                        # next mc
    &pxor   ("xmm3","xmm0");                    # 3 = 2A+B+D
    &pshufb ("xmm0","xmm1");                    # 0 = 2B+C
    &and    ($magic,0x30);                      # ... mod 4
    &pxor   ("xmm0","xmm3");                    # 0 = 2A+3B+C+D
    # The zero flag set here is consumed by the &jnz at the end of
    # enc_entry; only SSE instructions are issued in between.
    &sub    ($round,1);                         # nr--

&set_label("enc_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm6");                    # 0 = k
    &movdqa ("xmm5",&QWP($k_inv+16,$const));    # 5 : a/k
    &pshufb ("xmm5","xmm0");                    # 5 = a/k
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &pxor   ("xmm3","xmm5");                    # 3 = iak = 1/i + a/k
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm5");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &movdqu ("xmm5",&QWP(0,$key));              # 5 = round key
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &jnz    (&label("enc_loop"));

    # middle of last round
    &movdqa ("xmm4",&QWP($k_sbo,$const));       # 4 : sbou  .Lk_sbo
    &movdqa ("xmm0",&QWP($k_sbo+16,$const));    # 0 : sbot  .Lk_sbo+16
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm5");                    # 4 = sbou + k
    &pshufb ("xmm0","xmm3");                    # 0 = sbot
    &movdqa ("xmm1",&QWP(0x40,$base,$magic));   # .Lk_sr[]
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm1");
    &ret    ();
&function_end_B("_vpaes_encrypt_core");

##
##  Decryption core
##
##  Same API as encryption core.
##
##  $base (%ebx) points at the dsbd table, so the dsb9/dsbd/dsbb/
##  dsbe/dsbo tables are reached with the fixed displacements
##  -0x20..0x70.  %xmm5 holds a row of mc_forward that is rotated
##  with palignr at the end of every round.  $magic (%ecx) is turned
##  into a pointer to the sr row applied after the last round.
##
&function_begin_B("_vpaes_decrypt_core");
    &mov    ($round,&DWP(240,$key));
    &lea    ($base,&DWP($k_dsbd,$const));
    &movdqa ("xmm1","xmm6");
    &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
    &pandn  ("xmm1","xmm0");
    &mov    ($magic,$round);
    &psrld  ("xmm1",4);
    &movdqu ("xmm5",&QWP(0,$key));
    &shl    ($magic,4);
    &pand   ("xmm0","xmm6");
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
    &xor    ($magic,0x30);
    &pshufb ("xmm0","xmm1");
    &and    ($magic,0x30);
    &pxor   ("xmm2","xmm5");
    &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
    &pxor   ("xmm0","xmm2");
    &add    ($key,16);
    &lea    ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
    &jmp    (&label("dec_entry"));

&set_label("dec_loop",16);
##
##  Inverse mix columns
##
    &movdqa ("xmm4",&QWP(-0x20,$base));         # 4 : sb9u
    &pshufb ("xmm4","xmm2");                    # 4 = sb9u
    &pxor   ("xmm4","xmm0");
    &movdqa ("xmm0",&QWP(-0x10,$base));         # 0 : sb9t
    &pshufb ("xmm0","xmm3");                    # 0 = sb9t
    &pxor   ("xmm0","xmm4");                    # 0 = ch
    &add    ($key,16);                          # next round key

    &pshufb ("xmm0","xmm5");                    # MC ch
    &movdqa ("xmm4",&QWP(0,$base));             # 4 : sbdu
    &pshufb ("xmm4","xmm2");                    # 4 = sbdu
    &pxor   ("xmm4","xmm0");                    # 4 = ch
    &movdqa ("xmm0",&QWP(0x10,$base));          # 0 : sbdt
    &pshufb ("xmm0","xmm3");                    # 0 = sbdt
    &pxor   ("xmm0","xmm4");                    # 0 = ch
    # The zero flag set here is consumed by the &jnz at the end of
    # dec_entry; only SSE instructions are issued in between.
    &sub    ($round,1);                         # nr--

    &pshufb ("xmm0","xmm5");                    # MC ch
    &movdqa ("xmm4",&QWP(0x20,$base));          # 4 : sbbu
    &pshufb ("xmm4","xmm2");                    # 4 = sbbu
    &pxor   ("xmm4","xmm0");                    # 4 = ch
    &movdqa ("xmm0",&QWP(0x30,$base));          # 0 : sbbt
    &pshufb ("xmm0","xmm3");                    # 0 = sbbt
    &pxor   ("xmm0","xmm4");                    # 0 = ch

    &pshufb ("xmm0","xmm5");                    # MC ch
    &movdqa ("xmm4",&QWP(0x40,$base));          # 4 : sbeu
    &pshufb ("xmm4","xmm2");                    # 4 = sbeu
    &pxor   ("xmm4","xmm0");                    # 4 = ch
    &movdqa ("xmm0",&QWP(0x50,$base));          # 0 : sbet
    &pshufb ("xmm0","xmm3");                    # 0 = sbet
    &pxor   ("xmm0","xmm4");                    # 0 = ch

    &palignr("xmm5","xmm5",12);

&set_label("dec_entry");
    # top of round
    &movdqa ("xmm1","xmm6");                    # 1 : i
    &pandn  ("xmm1","xmm0");                    # 1 = i<<4
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm6");                    # 0 = k
    &movdqa ("xmm2",&QWP($k_inv+16,$const));    # 2 : a/k
    &pshufb ("xmm2","xmm0");                    # 2 = a/k
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &movdqa ("xmm3","xmm7");                    # 3 : 1/i
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &pxor   ("xmm3","xmm2");                    # 3 = iak = 1/i + a/k
    &movdqa ("xmm4","xmm7");                    # 4 : 1/j
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm2");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm7");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &movdqa ("xmm3","xmm7");                    # 3 : 1/jak
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &movdqu ("xmm0",&QWP(0,$key));              # 0 = round key
    &jnz    (&label("dec_loop"));

    # middle of last round
    &movdqa ("xmm4",&QWP(0x60,$base));          # 4 : sbou
    &pshufb ("xmm4","xmm2");                    # 4 = sbou
    &pxor   ("xmm4","xmm0");                    # 4 = sbou + k
    &movdqa ("xmm0",&QWP(0x70,$base));          # 0 : sbot
    &movdqa ("xmm2",&QWP(0,$magic));            # 2 : sr row
    &pshufb ("xmm0","xmm3");                    # 0 = sbot
    &pxor   ("xmm0","xmm4");                    # 0 = A
    &pshufb ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_decrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  Register usage on entry, as set up by the set_*_key callers:
##     $inp   (%esi) = user key
##     $round (%eax) = key length in bits (compared against 192)
##     $key   (%edx) = where the first round key is written
##     $out   (%edi) = 0 when encrypting, non-zero when decrypting
##     $magic (%ecx) = byte offset of the current row of the sr table
##     $const (%ebp) = PIC offset; the return address is added here
##
##  Stack usage (the caller provides a 16-byte aligned frame; the
##  return address pushed by the call sits at 0(%esp)):
##      4(%esp) = rcon (the "xmm8" of the x86_64 original)
##     20(%esp) = %xmm7 spill slot used by the 256-bit schedule
##
&function_begin_B("_vpaes_schedule_core");
    &add    ($const,&DWP(0,"esp"));
    &movdqu ("xmm0",&QWP(0,$inp));              # load key (unaligned)
    &movdqa ("xmm2",&QWP($k_rcon,$const));      # load rcon

    # input transform
    &movdqa ("xmm3","xmm0");
    &lea    ($base,&DWP($k_ipt,$const));
    &movdqa (&QWP(4,"esp"),"xmm2");             # xmm8
    &call   ("_vpaes_schedule_transform");
    &movdqa ("xmm7","xmm0");

    &test   ($out,$out);
    &jnz    (&label("schedule_am_decrypting"));

    # encrypting, output zeroth round key after transform
    &movdqu (&QWP(0,$key),"xmm0");
    &jmp    (&label("schedule_go"));

&set_label("schedule_am_decrypting");
    # decrypting, output zeroth round key after shiftrows
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &movdqu (&QWP(0,$key),"xmm3");
    &xor    ($magic,0x30);

&set_label("schedule_go");
    &cmp    ($round,192);
    &ja     (&label("schedule_256"));
    &je     (&label("schedule_192"));
    # 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
    &mov    ($round,10);

&set_label("loop_schedule_128");
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # write output
    &jmp    (&label("loop_schedule_128"));

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
&set_label("schedule_192",16);
    &movdqu ("xmm0",&QWP(8,$inp));              # load key part 2 (very unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &movdqa ("xmm6","xmm0");                    # save short part
    &pxor   ("xmm4","xmm4");                    # clear 4
    &movhlps("xmm6","xmm4");                    # clobber low side with zeros
    &mov    ($round,4);

&set_label("loop_schedule_192");
    &call   ("_vpaes_schedule_round");
    &palignr("xmm0","xmm6",8);
    &call   ("_vpaes_schedule_mangle");         # save key n
    &call   ("_vpaes_schedule_192_smear");
    &call   ("_vpaes_schedule_mangle");         # save key n+1
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");         # save key n+2
    &call   ("_vpaes_schedule_192_smear");
    &jmp    (&label("loop_schedule_192"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
    &movdqu ("xmm0",&QWP(16,$inp));             # load key part 2 (unaligned)
    &call   ("_vpaes_schedule_transform");      # input transform
    &mov    ($round,7);

&set_label("loop_schedule_256");
    &call   ("_vpaes_schedule_mangle");         # output low result
    &movdqa ("xmm6","xmm0");                    # save cur_lo in xmm6

    # high round
    &call   ("_vpaes_schedule_round");
    &dec    ($round);
    &jz     (&label("schedule_mangle_last"));
    &call   ("_vpaes_schedule_mangle");

    # low round. swap xmm7 and xmm6
    &pshufd ("xmm0","xmm0",0xFF);
    &movdqa (&QWP(20,"esp"),"xmm7");
    &movdqa ("xmm7","xmm6");
    &call   ("_vpaes_schedule_low_round");
    &movdqa ("xmm7",&QWP(20,"esp"));

    &jmp    (&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
    # schedule last round key from xmm0
    &lea    ($base,&DWP($k_deskew,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_last_dec"));

    # encrypting
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm0","xmm1");                    # output permute
    &lea    ($base,&DWP($k_opt,$const));        # prepare to output transform
    &add    ($key,32);                          # net +16 after the -16 below

&set_label("schedule_mangle_last_dec");
    &add    ($key,-16);
    &pxor   ("xmm0",&QWP($k_s63,$const));
    &call   ("_vpaes_schedule_transform");      # output transform
    &movdqu (&QWP(0,$key),"xmm0");              # save last key

    # cleanup: zero every xmm register so no key material is left behind
    &pxor   ("xmm0","xmm0");
    &pxor   ("xmm1","xmm1");
    &pxor   ("xmm2","xmm2");
    &pxor   ("xmm3","xmm3");
    &pxor   ("xmm4","xmm4");
    &pxor   ("xmm5","xmm5");
    &pxor   ("xmm6","xmm6");
    &pxor   ("xmm7","xmm7");
    &ret    ();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6:  low side, d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
##  Clobbers %xmm1 (zeroed locally; the x86_64 original kept the
##  zero in %xmm13 instead).
##
&function_begin_B("_vpaes_schedule_192_smear");
    &pshufd ("xmm0","xmm6",0x80);               # d c 0 0 -> c 0 0 0
    &pxor   ("xmm6","xmm0");                    # -> c+d c 0 0
    &pshufd ("xmm0","xmm7",0xFE);               # b a _ _ -> b b b a
    &pxor   ("xmm6","xmm0");                    # -> b+c+d b+c b a
    &movdqa ("xmm0","xmm6");
    &pxor   ("xmm1","xmm1");
    &movhlps("xmm6","xmm1");                    # clobber low side with zeros
    &ret    ();
&function_end_B("_vpaes_schedule_192_smear");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of "xmm8", then rotates "xmm8" for
##  next rcon.  In this 32-bit port "xmm8" is the 16-byte stack
##  slot at 8(%esp), i.e. the slot _vpaes_schedule_core fills at
##  its own 4(%esp).
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
    # extract rcon from xmm8
    &movdqa ("xmm2",&QWP(8,"esp"));             # xmm8
    &pxor   ("xmm1","xmm1");
    &palignr("xmm1","xmm2",15);
    &palignr("xmm2","xmm2",15);
    &pxor   ("xmm7","xmm1");

    # rotate
    &pshufd ("xmm0","xmm0",0xFF);
    &palignr("xmm0","xmm0",1);

    # store the rotated rcon back, then fall through...
    &movdqa (&QWP(8,"esp"),"xmm2");             # xmm8

    # low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
    # smear xmm7
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",4);
    &pxor   ("xmm7","xmm1");
    &movdqa ("xmm1","xmm7");
    &pslldq ("xmm7",8);
    &pxor   ("xmm7","xmm1");
    &pxor   ("xmm7",&QWP($k_s63,$const));

    # subbyte
    &movdqa ("xmm4",&QWP($k_s0F,$const));       # 4 : s0F mask
    &movdqa ("xmm5",&QWP($k_inv,$const));       # 5 : inv
    &movdqa ("xmm1","xmm4");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);                         # 1 = i
    &pand   ("xmm0","xmm4");                    # 0 = k
    &movdqa ("xmm2",&QWP($k_inv+16,$const));    # 2 : a/k
    &pshufb ("xmm2","xmm0");                    # 2 = a/k
    &pxor   ("xmm0","xmm1");                    # 0 = j
    &movdqa ("xmm3","xmm5");                    # 3 : 1/i
    &pshufb ("xmm3","xmm1");                    # 3 = 1/i
    &pxor   ("xmm3","xmm2");                    # 3 = iak = 1/i + a/k
    &movdqa ("xmm4","xmm5");                    # 4 : 1/j
    &pshufb ("xmm4","xmm0");                    # 4 = 1/j
    &pxor   ("xmm4","xmm2");                    # 4 = jak = 1/j + a/k
    &movdqa ("xmm2","xmm5");                    # 2 : 1/iak
    &pshufb ("xmm2","xmm3");                    # 2 = 1/iak
    &pxor   ("xmm2","xmm0");                    # 2 = io
    &movdqa ("xmm3","xmm5");                    # 3 : 1/jak
    &pshufb ("xmm3","xmm4");                    # 3 = 1/jak
    &pxor   ("xmm3","xmm1");                    # 3 = jo
    &movdqa ("xmm4",&QWP($k_sb1,$const));       # 4 : sb1u
    &pshufb ("xmm4","xmm2");                    # 4 = sb1u
    &movdqa ("xmm0",&QWP($k_sb1+16,$const));    # 0 : sb1t
    &pshufb ("xmm0","xmm3");                    # 0 = sb1t
    &pxor   ("xmm0","xmm4");                    # 0 = sbox output

    # add in smeared stuff
    &pxor   ("xmm0","xmm7");
    &movdqa ("xmm7","xmm0");
    &ret    ();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
    &movdqa ("xmm2",&QWP($k_s0F,$const));
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm0");
    &psrld  ("xmm1",4);                         # 1 = high nibbles
    &pand   ("xmm0","xmm2");                    # 0 = low nibbles
    &movdqa ("xmm2",&QWP(0,$base));
    &pshufb ("xmm2","xmm0");
    &movdqa ("xmm0",&QWP(16,$base));
    &pshufb ("xmm0","xmm1");
    &pxor   ("xmm0","xmm2");
    &ret    ();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##  On the decrypt path %esi is overwritten with the address of
##  the dksd table.
##
&function_begin_B("_vpaes_schedule_mangle");
    &movdqa ("xmm4","xmm0");                    # save xmm0 for later
    &movdqa ("xmm5",&QWP($k_mc_forward,$const));
    &test   ($out,$out);
    &jnz    (&label("schedule_mangle_dec"));

    # encrypting
    &add    ($key,16);
    &pxor   ("xmm4",&QWP($k_s63,$const));
    &pshufb ("xmm4","xmm5");
    &movdqa ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");
    &pshufb ("xmm4","xmm5");
    &pxor   ("xmm3","xmm4");

    &jmp    (&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
    # inverse mix columns
    &movdqa ("xmm2",&QWP($k_s0F,$const));
    &lea    ($inp,&DWP($k_dksd,$const));
    &movdqa ("xmm1","xmm2");
    &pandn  ("xmm1","xmm4");
    &psrld  ("xmm1",4);                         # 1 = hi
    &pand   ("xmm4","xmm2");                    # 4 = lo

    &movdqa ("xmm2",&QWP(0,$inp));
    &pshufb ("xmm2","xmm4");
    &movdqa ("xmm3",&QWP(0x10,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x20,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x30,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x40,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x50,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");
    &pshufb ("xmm3","xmm5");

    &movdqa ("xmm2",&QWP(0x60,$inp));
    &pshufb ("xmm2","xmm4");
    &pxor   ("xmm2","xmm3");
    &movdqa ("xmm3",&QWP(0x70,$inp));
    &pshufb ("xmm3","xmm1");
    &pxor   ("xmm3","xmm2");

    &add    ($key,-16);

&set_label("schedule_mangle_both");
    &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
    &pshufb ("xmm3","xmm1");
    &add    ($magic,-16);
    &and    ($magic,0x30);
    &movdqu (&QWP(0,$key),"xmm3");
    &ret    ();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#
# Every entry point below builds the same scratch frame: %esp is lowered
# by at least 56 bytes and rounded down to a 16-byte boundary, and the
# previous %esp is kept at 48(%esp) so it can be restored before return.
# $const is loaded with "_vpaes_consts+0x30-pic_point"; the callee adds
# the return address (== pic_point) to obtain the absolute table address.
#

# ${PREFIX}_set_encrypt_key: wparam(0)=user key, wparam(1)=bits,
# wparam(2)=key schedule.  Stores bits/32+5 at offset 240 of the
# schedule, runs the schedule with $out=0 (encrypt) and returns 0.
&function_begin("${PREFIX}_set_encrypt_key");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));                # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);             # AES_KEY->rounds = nbits/32+5;
    &mov    ($magic,0x30);
    &mov    ($out,0);

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");

# ${PREFIX}_set_decrypt_key: same arguments as the encrypt variant.
# $key is advanced to 16+rounds*16 bytes into the schedule before the
# core is called, because the decrypt path of _vpaes_schedule_mangle
# moves the output pointer downwards; $out=1 selects decryption.
&function_begin("${PREFIX}_set_decrypt_key");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($round,&wparam(1));                # bits
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &mov    ($base,$round);
    &shr    ($base,5);
    &add    ($base,5);
    &mov    (&DWP(240,$key),$base);             # AES_KEY->rounds = nbits/32+5;
    &shl    ($base,4);
    &lea    ($key,&DWP(16,$key,$base));

    &mov    ($out,1);
    &mov    ($magic,$round);
    &shr    ($magic,1);
    &and    ($magic,32);
    &xor    ($magic,32);                        # nbits==192?0:32;

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_schedule_core");
&set_label("pic_point");

    &mov    ("esp",&DWP(48,"esp"));
    &xor    ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");

# ${PREFIX}_encrypt: wparam(0)=input block, wparam(1)=output block,
# wparam(2)=key schedule.  Processes exactly one 16-byte block.
&function_begin("${PREFIX}_encrypt");
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));                  # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_encrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");

# ${PREFIX}_decrypt: same arguments as ${PREFIX}_encrypt, one block.
&function_begin("${PREFIX}_decrypt");
    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &mov    ($inp,&wparam(0));                  # inp
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($out,&wparam(1));                  # out
    &and    ($base,-16);
    &mov    ($key,&wparam(2));                  # key
    &xchg   ($base,"esp");                      # alloca
    &mov    (&DWP(48,"esp"),$base);

    &movdqu ("xmm0",&QWP(0,$inp));
    &call   ("_vpaes_decrypt_core");
    &movdqu (&QWP(0,$out),"xmm0");

    &mov    ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");

# ${PREFIX}_cbc_encrypt: wparam(0)=inp, wparam(1)=out, wparam(2)=len,
# wparam(3)=key, wparam(4)=ivp, wparam(5)=enc (0 selects decryption).
#
# Returns without touching anything when len < 16.  Otherwise whole
# 16-byte blocks are processed until fewer than 16 bytes remain; a
# trailing partial block is not handled.  The final chaining value is
# written back through ivp.
#
# Scratch frame layout:
#    0(%esp) = out - inp      4(%esp) = key       8(%esp) = ivp
#   16(%esp) = current IV    32(%esp) = next IV  48(%esp) = saved %esp
&function_begin("${PREFIX}_cbc_encrypt");
    &mov    ($inp,&wparam(0));                  # inp
    &mov    ($out,&wparam(1));                  # out
    &mov    ($round,&wparam(2));                # len
    &mov    ($key,&wparam(3));                  # key
    &sub    ($round,16);
    &jc     (&label("cbc_abort"));
    &lea    ($base,&DWP(-56,"esp"));
    &mov    ($const,&wparam(4));                # ivp
    &and    ($base,-16);
    &mov    ($magic,&wparam(5));                # enc
    &xchg   ($base,"esp");                      # alloca
    &movdqu ("xmm1",&QWP(0,$const));            # load IV
    &sub    ($out,$inp);
    &mov    (&DWP(48,"esp"),$base);

    &mov    (&DWP(0,"esp"),$out);               # save out
    &mov    (&DWP(4,"esp"),$key);               # save key
    &mov    (&DWP(8,"esp"),$const);             # save ivp
    &mov    ($out,$round);                      # $out works as $len

    &lea    ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
    &call   ("_vpaes_preheat");
&set_label("pic_point");
    &cmp    ($magic,0);
    &je     (&label("cbc_dec_loop"));
    &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));              # load input
    &pxor   ("xmm0","xmm1");                    # inp^=iv
    &call   ("_vpaes_encrypt_core");
    &mov    ($base,&DWP(0,"esp"));              # restore out
    &mov    ($key,&DWP(4,"esp"));               # restore key
    &movdqa ("xmm1","xmm0");                    # ciphertext is the next IV
    &movdqu (&QWP(0,$base,$inp),"xmm0");        # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_enc_loop"));
    &jmp    (&label("cbc_done"));

&set_label("cbc_dec_loop",16);
    &movdqu ("xmm0",&QWP(0,$inp));              # load input
    &movdqa (&QWP(16,"esp"),"xmm1");            # save IV
    &movdqa (&QWP(32,"esp"),"xmm0");            # save future IV
    &call   ("_vpaes_decrypt_core");
    &mov    ($base,&DWP(0,"esp"));              # restore out
    &mov    ($key,&DWP(4,"esp"));               # restore key
    &pxor   ("xmm0",&QWP(16,"esp"));            # out^=iv
    &movdqa ("xmm1",&QWP(32,"esp"));            # load next IV
    &movdqu (&QWP(0,$base,$inp),"xmm0");        # write output
    &lea    ($inp,&DWP(16,$inp));
    &sub    ($out,16);
    &jnc    (&label("cbc_dec_loop"));

&set_label("cbc_done");
    &mov    ($base,&DWP(8,"esp"));              # restore ivp
    &mov    ("esp",&DWP(48,"esp"));
    &movdqu (&QWP(0,$base),"xmm1");             # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

&asm_finish();