/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_AES) && defined(HITLS_CRYPTO_XTS)

#include "crypt_aes_macro_x86_64.s"

.file   "crypt_aes_xts_x86_64.S"

// Incoming arguments (the registers the two entry points read on entry).
.set    KEY, %rdi                               // key structure: round keys at offset 0, round count at offset 240
.set    IN, %rsi                                // input pointer
.set    OUT, %rdx                               // output pointer
.set    LEN, %ecx                               // total length in bytes
.set    TWEAK, %r8                              // 16-byte tweak, loaded on entry and stored back on exit

// General-purpose scratch. Several aliases intentionally share one register
// (KTMP/TMPIN/KEYEND are all %r9); they are live in different phases.
.set    KTMP, %r9                               // key pointer handed to the AES_*_BLK(S) macros
.set    LTMP, %r15d                             // bytes of whole blocks still to process
.set    TAILNUM,%r14d                           // LEN % 16, bytes in the trailing partial block
.set    TMPOUT,%r13                             // output cursor of the tail byte-swap loop
.set    TMPIN,%r9                               // input cursor of the tail byte-swap loop

.set    ROUNDS, %eax                            // round count (or round count * 16 after the shl)
.set    RET, %eax                               // return value
.set    TROUNDS, %r10                           // negative byte offset walking the round keys in the 6-block path
.set    ROUNDSQ,%rax                            // 64-bit view of ROUNDS
.set    KEYEND,%r9                              // KEY + 16 + rounds*16, one past the last round key

.set    WTMP0, %ecx
.set    WTMP1, %r10d
.set    WTMP2, %r11d

.set    XTMP0, %rcx
.set    XTMP1, %r10
.set    XTMP2, %r11

.set    TWX0, %r13
.set    TWX1, %r14

// Data blocks being processed.
.set    BLK0, %xmm8
.set    BLK1, %xmm9
.set    BLK2, %xmm10
.set    BLK3, %xmm11
.set    BLK4, %xmm12
.set    BLK5, %xmm13
.set    BLK6, %xmm14

// Per-block tweaks. TWEAK6 aliases GFP and BLK6 aliases TWKTMP (see below).
.set    TWEAK0, %xmm0
.set    TWEAK1, %xmm1
.set    TWEAK2, %xmm2
.set    TWEAK3, %xmm3
.set    TWEAK4, %xmm4
.set    TWEAK5, %xmm5
.set    TWEAK6, %xmm6

.set    RDK, %xmm15                             // current round key
.set    RDK1, %xmm7                             // last round key (6-block path); shares xmm7 with TMPX
.set    TMPX, %xmm7                             // scratch for the tweak update
.set    GFP, %xmm6                              // reduction mask loaded from .Lgfp128
.set    TWKTMP, %xmm14                          // carry-tracking copy of the tweak, see NextTweakCore


/*
 * NextTweakCore: advance the 128-bit tweak \twkin in place to the next block's
 * tweak (a left shift by one bit with the carry out of bit 127 folded back in
 * through the mask \gfp).
 *   \gfp     reduction/carry mask, left unchanged
 *   \twkin   tweak, updated in place
 *   \twktmp  carry tracker; callers set it once with `vpshufd $0x5f` of the tweak
 *            (dwords 3,3,1,1) and it is shifted left by one per dword on every use
 *   \tmp     scratch, clobbered
 */
.macro NextTweakCore gfp, twkin, twktmp, tmp
    vmovdqa \twktmp,\tmp
    vpaddd \twktmp,\twktmp,\twktmp              // doubleword << 1, exposes the next carry bits for the following call
    vpsrad $31,\tmp,\tmp                        // ASR doubleword: all-ones where the tracked top bit is set
    vpaddq \twkin,\twkin,\twkin                 // quadword << 1
    vpand \gfp,\tmp,\tmp                        // keep 0x87 (carry out of bit 127) and 0x1 in the high qword (carry out of bit 63)
    vpxor \tmp,\twkin,\twkin                    // fold both carries into the shifted tweak
.endm

/*
 * NextTweak: same as NextTweakCore, and additionally copies the updated
 * tweak from \twkin into \twkout.
 */
.macro NextTweak gfp, twkin, twkout, twktmp, tmp
    NextTweakCore \gfp,\twkin,\twktmp,\tmp
    vmovdqa \twkin,\twkout
.endm
// SAVE_STACK / LOAD_STACK: push and pop the listed registers in mirrored order.
// Neither macro is invoked in this file; the functions below carry their own
// prologue and epilogue.
.macro SAVE_STACK
    push %rbx
    push %rbp
    push %rsp
    push %r12
    push %r13
    push %r14
    push %r15
.endm

.macro LOAD_STACK
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rsp
    pop %rbp
    pop %rbx
.endm

// The mask is only ever read, so it is placed in read-only data.
.section .rodata
.align 64
// modulus of Galois Field x^128+x^7+x^2+x+1 => 0x87(0b10000111)
// Layout: low qword = 0x87 (carry out of bit 127), high qword = 0x1 (carry out of bit 63).
.Lgfp128:
.long 0x87,0,1,0

.text

/**
 * Function description: AES-XTS encryption with AES-NI/AVX instructions, including
 *                       ciphertext stealing for a trailing partial block.
 * Function prototype: int32_t CRYPT_AES_XTS_Encrypt(const CRYPT_AES_Key *ctx,
 *                                              const uint8_t *in, uint8_t *out, uint32_t len, ...);
 *                     A fifth argument is read from %r8 (pointer to the 16-byte tweak);
 *                     its C declaration is not visible in this file.
 * Input register:
 *        rdi: Pointer to the key structure (round keys at offset 0, round count at offset 240).
 *        rsi: Pointer to the input data.
 *        rdx: Pointer to the output data.
 *        ecx: Total length in bytes; len % 16 bytes are handled by ciphertext stealing.
 *        r8:  Pointer to the 16-byte tweak. It is loaded on entry and the advanced
 *             tweak is stored back before returning.
 * NOTE(review): len < 16 is not checked here and the stealing path reads out[-16];
 *               callers presumably guarantee len >= 16 - confirm.
 * Change register: rax, rsi, rdx, r9, r10, r11, xmm0-xmm15, flags (at least; the
 *                  AES_ENC_n_BLK(S) macros are defined elsewhere).
 *                  rbx, rbp, r12-r15 are saved and restored.
 * Output register: eax (always 0).
 * Function/Macro Call: NextTweak, NextTweakCore, AES_ENC_1_BLK, AES_ENC_2_BLKS ... AES_ENC_5_BLKS.
 */
.align 32
.globl CRYPT_AES_XTS_Encrypt
.type CRYPT_AES_XTS_Encrypt, @function
CRYPT_AES_XTS_Encrypt:
.cfi_startproc
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    sub $96,%rsp                                // 6 * 16 bytes of scratch for the 6-block path
    mov %rsp,%rbp                               // remember the unaligned stack pointer for the epilogue
    and $-16,%rsp                               // 16 bytes align

    movl LEN, LTMP
    movl LEN, TAILNUM
    andl $-16,LTMP                              // bytes in whole blocks
    andl $0xf,TAILNUM                           // LEN % 16
    movl 240(KEY), ROUNDS
    vmovdqa .Lgfp128(%rip),GFP
    vmovdqu (TWEAK), TWEAK0
    shl $4,ROUNDS                               // roundkey size: rounds*16, except for the last one
    lea 16(KEY, ROUNDSQ),KEYEND                 // step to the end of roundkeys

// Dispatch on the number of whole blocks left (LTMP is always a multiple of 16).
.Lxts_aesenc_start:
    cmpl $64, LTMP
    jae .Lxts_enc_above_equal_4_blks
    cmpl $32, LTMP
    jae .Lxts_enc_above_equal_2_blks
    testl LTMP, LTMP
    je .Lxts_aesenc_finish
    jmp .Lxts_enc_proc_1_blk

.Lxts_enc_above_equal_2_blks:
    cmpl $48, LTMP
    jb .Lxts_enc_proc_2_blks
    jmp .Lxts_enc_proc_3_blks

.Lxts_enc_above_equal_4_blks:
    cmpl $96, LTMP
    jae .Lxts_enc_proc_6_blks_pre
    cmpl $80, LTMP
    jb .Lxts_enc_proc_4_blks
    jmp .Lxts_enc_proc_5_blks

.align 16
.Lxts_enc_proc_1_blk:
    vmovdqu (IN),BLK0
.Lxts_enc_proc_1blk_loaded:                     // also entered from the stealing path with BLK0 preloaded
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    vpxor RDK,BLK0,BLK0
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    AES_ENC_1_BLK KTMP ROUNDS RDK BLK0
    vpxor TWEAK0, BLK0, BLK0
    vmovdqu BLK0, (OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 16(IN),IN
    subl $16,LTMP                               // lea keeps these flags alive for the je below
    lea 16(OUT),OUT

    je .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_2_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    AES_ENC_2_BLKS KTMP ROUNDS RDK BLK0 BLK1
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 32(IN),IN
    subl $32,LTMP
    lea 32(OUT),OUT

    je .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_3_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    AES_ENC_3_BLKS KTMP ROUNDS RDK BLK0 BLK1 BLK2
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 48(IN),IN
    subl $48,LTMP
    lea 48(OUT),OUT
    je .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_4_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    vpxor 48(IN), RDK, BLK3
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    AES_ENC_4_BLKS KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 64(IN),IN
    subl $64,LTMP
    lea 64(OUT),OUT
    je .Lxts_aesenc_finish

.align 16
.Lxts_enc_proc_5_blks:
    mov KEY,KTMP
    vpshufd $0x5f,TWEAK0,TWKTMP
    vmovdqa TWEAK0,TWEAK5
    movl 240(KTMP), ROUNDS
    vmovdqu (KTMP), RDK
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    vpxor (IN), RDK, BLK0
    vpxor 16(IN), RDK, BLK1
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    vpxor 32(IN), RDK, BLK2
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    vpxor 48(IN), RDK, BLK3
    NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX
    vpxor 64(IN), RDK, BLK4
    decl ROUNDS
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vpxor TWEAK4, BLK4, BLK4
    AES_ENC_5_BLKS KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4
    vpxor TWEAK0, BLK0, BLK0
    vpxor TWEAK1, BLK1, BLK1
    vpxor TWEAK2, BLK2, BLK2
    vpxor TWEAK3, BLK3, BLK3
    vpxor TWEAK4, BLK4, BLK4
    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    vmovdqu BLK4, 64(OUT)
    NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX
    lea 80(IN),IN
    subl $80,LTMP
    lea 80(OUT),OUT
    je .Lxts_aesenc_finish

// 6-block path. Entered once through _pre, which derives tweak0..tweak5; every
// later iteration reuses the tweaks computed while the previous one was running.
.align 16
.Lxts_enc_proc_6_blks_pre:
    vpshufd $0x5f,TWEAK0,TWKTMP                 // save higher doubleword of tweak
    vmovdqa TWEAK0,TWEAK5                       // copy first tweak
    NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX
    NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX
    NextTweakCore GFP, TWEAK5, TWKTMP, TMPX

.Lxts_enc_proc_6_blks:
    vmovdqu (KEY), RDK
    vmovdqu (IN),BLK0
    vpxor TWEAK0,BLK0,BLK0                      // blk0 ^= tweak0
    vpxor RDK,BLK0,BLK0                         // blk0 = blk0 ^ tweak0 ^ rk0, prepared for the loop round
    vmovdqu -16(KEYEND),RDK1                    // load last round key

    // The VEX forms are used for the round-key memory operands: unlike the
    // legacy SSE encoding they do not fault on a key schedule that is not
    // 16-byte aligned (the schedule is read with vmovdqu everywhere else).
    vmovdqu 16(IN),BLK1
    vpxor RDK1,TWEAK0,TWEAK0
    vaesenc 16(KEY),BLK0,BLK0                   // first round: rk1
    vmovdqa TWEAK0,(%rsp)                       // spill tweak0 ^ last round key
    vpxor TWEAK1,BLK1,BLK1
    vpxor RDK,BLK1,BLK1

    vmovdqu 32(IN),BLK2
    vpxor RDK1,TWEAK1,TWEAK1
    vaesenc 16(KEY),BLK1,BLK1
    vmovdqa TWEAK1,16(%rsp)
    vpxor TWEAK2,BLK2,BLK2
    vpxor RDK,BLK2,BLK2

    vmovdqu 48(IN),BLK3
    vpxor RDK1,TWEAK2,TWEAK2
    vaesenc 16(KEY),BLK2,BLK2
    vmovdqa TWEAK2,32(%rsp)
    vpxor TWEAK3,BLK3,BLK3
    vpxor RDK,BLK3,BLK3

    vmovdqu 64(IN),BLK4
    vpxor RDK1,TWEAK3,TWEAK3
    vaesenc 16(KEY),BLK3,BLK3
    vmovdqa TWEAK3,48(%rsp)
    vpxor TWEAK4,BLK4,BLK4
    vpxor RDK,BLK4,BLK4

    vmovdqu 80(IN),BLK5
    vpxor RDK1,TWEAK4,TWEAK4
    vaesenc 16(KEY),BLK4,BLK4
    vmovdqa TWEAK4,64(%rsp)
    vpxor TWEAK5,BLK5,BLK5
    vpxor RDK,BLK5,BLK5
    vpxor RDK1,TWEAK5,TWEAK5
    vaesenc 16(KEY),BLK5,BLK5
    vmovdqa TWEAK5,80(%rsp)

    mov $(7*16),TROUNDS                         // loop 7 rounds
    sub ROUNDSQ,TROUNDS                         // negative offset, counts up to zero
.align 16
.Lxts_6_blks_loop:
    vmovdqu -96(KEYEND,TROUNDS),RDK             // left 5+1 block to interval
    vaesenc RDK, BLK0, BLK0
    vaesenc RDK, BLK1, BLK1
    vaesenc RDK, BLK2, BLK2
    add $16,TROUNDS                             // sets ZF for the jnz; the AES instructions leave flags alone
    vaesenc RDK, BLK3, BLK3
    vaesenc RDK, BLK4, BLK4
    vaesenc RDK, BLK5, BLK5
    jnz .Lxts_6_blks_loop

    // Remaining five middle rounds, each interleaved with one tweak update
    // producing tweak0..tweak4 for the next 6-block iteration.
    vpxor 80(%rsp),RDK1,TWEAK5                  // tweak5 = tweak5^lastroundkey^lastroundkey
    vmovdqu -96(KEYEND,TROUNDS),RDK
    vpshufd $0x5f,TWEAK5,TWKTMP                 // use new tweak-tmp
    vmovdqa TWKTMP,TMPX                         // pre-calculate next round tweak0~tweak5
    vaesenc RDK, BLK0, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    vaesenc RDK, BLK1, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    vaesenc RDK, BLK2, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    vaesenc RDK, BLK3, BLK3
    vmovdqa TWEAK5,TWEAK0
    vaesenc RDK, BLK4, BLK4
    vaesenc RDK, BLK5, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    vaesenc RDK, BLK0, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    vaesenc RDK, BLK1, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    vaesenc RDK, BLK2, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    vaesenc RDK, BLK3, BLK3
    vmovdqa TWEAK5,TWEAK1
    vaesenc RDK, BLK4, BLK4
    vaesenc RDK, BLK5, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    vaesenc RDK, BLK0, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    vaesenc RDK, BLK1, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    vaesenc RDK, BLK2, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    vaesenc RDK, BLK3, BLK3
    vmovdqa TWEAK5,TWEAK2
    vaesenc RDK, BLK4, BLK4
    vaesenc RDK, BLK5, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    vaesenc RDK, BLK0, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    vaesenc RDK, BLK1, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    vaesenc RDK, BLK2, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    add $16,TROUNDS
    vaesenc RDK, BLK3, BLK3
    vmovdqa TWEAK5,TWEAK3
    vaesenc RDK, BLK4, BLK4
    vaesenc RDK, BLK5, BLK5

    vmovdqu -96(KEYEND,TROUNDS),RDK
    vmovdqa TWKTMP,TMPX
    vaesenc RDK, BLK0, BLK0
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    vaesenc RDK, BLK1, BLK1
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    vaesenc RDK, BLK2, BLK2
    vpxor TMPX,TWEAK5,TWEAK5
    vaesenc RDK, BLK3, BLK3
    vmovdqa TWEAK5,TWEAK4
    vaesenc RDK, BLK4, BLK4
    vaesenc RDK, BLK5, BLK5

    // Final round, interleaved with the update that yields the next tweak5.
    vmovdqa TWKTMP,TMPX
    vaesenclast (%rsp), BLK0, BLK0
    vaesenclast 16(%rsp), BLK1, BLK1            // already do the tweak^lastround, so here just aesenclast
    vpaddd TWKTMP,TWKTMP,TWKTMP
    vpsrad $31,TMPX,TMPX
    vaesenclast 32(%rsp), BLK2, BLK2
    vpaddq TWEAK5,TWEAK5,TWEAK5
    vpand GFP,TMPX,TMPX
    vaesenclast 48(%rsp), BLK3, BLK3
    vpxor TMPX,TWEAK5,TWEAK5
    vaesenclast 64(%rsp), BLK4, BLK4
    vaesenclast 80(%rsp), BLK5, BLK5

    vmovdqu BLK0, (OUT)
    vmovdqu BLK1, 16(OUT)
    vmovdqu BLK2, 32(OUT)
    vmovdqu BLK3, 48(OUT)
    vmovdqu BLK4, 64(OUT)
    vmovdqu BLK5, 80(OUT)

    leaq 96(IN), IN
    leaq 96(OUT), OUT
    sub $96, LTMP
    cmp $96, LTMP
    jb .Lxts_aesenc_start                       // fewer than 6 blocks left: back to the dispatcher
    jmp .Lxts_enc_proc_6_blks

.align 16
.Lxts_aesenc_finish:
    testl TAILNUM, TAILNUM
    je .Lxts_ret
// Ciphertext stealing: for each tail byte, move the byte of the previous output
// block to the tail position of the output and put the tail input byte in its
// place, then encrypt that rebuilt block in place with the next tweak.
.Lxts_tail_proc:
    mov OUT,TMPOUT
    mov IN,TMPIN
.Lxts_tail_loop:
    sub $1,TAILNUM                              // flags consumed by the ja below; mov/movzb/lea do not touch them
    movzb -16(TMPOUT),%r10d
    movzb (TMPIN),%r11d
    mov %r10b,(TMPOUT)
    lea 1(TMPIN),TMPIN
    mov %r11b,-16(TMPOUT)
    lea 1(TMPOUT),TMPOUT
    ja .Lxts_tail_loop
    sub $16,OUT                                 // step 1 block back to save the last stealing block encryption
    add $16,LTMP                                // account for the extra block so the 1-block path ends at zero
    vmovdqu (OUT),BLK0
    jmp .Lxts_enc_proc_1blk_loaded

.Lxts_ret:
    vmovdqu TWEAK0, (TWEAK)                     // hand the advanced tweak back to the caller
    vpxor BLK0, BLK0, BLK0
    vpxor BLK1, BLK1, BLK1
    vpxor BLK2, BLK2, BLK2
    vpxor BLK3, BLK3, BLK3
    vpxor BLK4, BLK4, BLK4
    vpxor BLK5, BLK5, BLK5
    vpxor BLK6, BLK6, BLK6
    vpxor RDK, RDK, RDK
    vpxor RDK1, RDK1, RDK1
    // Wipe the scratch area: the 6-block path spills tweak ^ last-round-key there.
    vmovdqa RDK, (%rsp)
    vmovdqa RDK, 16(%rsp)
    vmovdqa RDK, 32(%rsp)
    vmovdqa RDK, 48(%rsp)
    vmovdqa RDK, 64(%rsp)
    vmovdqa RDK, 80(%rsp)
    xorl RET, RET

    mov %rbp,%rsp
    add $96,%rsp
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    popq %rbp
    popq %rbx
    ret
.cfi_endproc
.size CRYPT_AES_XTS_Encrypt, .-CRYPT_AES_XTS_Encrypt


/**
 * Function description: Sets the AES decryption and assembly acceleration API in XTS mode.
549 * Function prototype: int32_t CRYPT_AES_XTS_Decrypt(const CRYPT_AES_Key *ctx, 550 * const uint8_t *in, uint8_t *out, uint32_t len); 551 * Input register: 552 * x0: Pointer to the input key structure. 553 * x1: Points to the 128-bit input data. 554 * x2: Indicates the 128-bit output data. 555 * x3: Indicates the length of a data block, that is, 16 bytes. 556 * Change register: xmm1,xmm3,xmm4,xmm5,xmm6,xmm10,xmm11,xmm12,xmm13. 557 * Output register: eax. 558 * Function/Macro Call: None. 559 */ 560.align 32 561.globl CRYPT_AES_XTS_Decrypt 562.type CRYPT_AES_XTS_Decrypt, @function 563CRYPT_AES_XTS_Decrypt: 564.cfi_startproc 565 pushq %rbx 566 pushq %rbp 567 pushq %r12 568 pushq %r13 569 pushq %r14 570 pushq %r15 571 sub $96,%rsp 572 mov %rsp,%rbp 573 and $-16,%rsp // 16 bytes align 574 575 movl LEN, LTMP 576 movl LEN, TAILNUM 577 578 andl $-16,LTMP 579 movl LTMP,WTMP2 580 sub $16,WTMP2 // preserve last and tail block 581 andl $0xf,TAILNUM // LEN % 16 582 cmovg WTMP2,LTMP 583 movl 240(KEY), ROUNDS 584 vmovdqa .Lgfp128(%rip),GFP 585 vmovdqu (TWEAK), TWEAK0 586 shl $4,ROUNDS // roundkey size: rounds*16, except for the last one 587 lea 16(KEY, ROUNDSQ),KEYEND // step to the end of roundkeys 588 589.Lxts_aesdec_start: 590 cmpl $64, LTMP 591 jae .Lxts_dec_above_equal_4_blks 592 cmpl $32, LTMP 593 jae .Lxts_dec_above_equal_2_blks 594 cmpl $0, LTMP 595 je .Lxts_dec_last_2blks 596 jmp .Lxts_dec_proc_1_blk 597 598.Lxts_dec_above_equal_2_blks: 599 cmpl $48, LTMP 600 jb .Lxts_dec_proc_2_blks 601 jmp .Lxts_dec_proc_3_blks 602 603.Lxts_dec_above_equal_4_blks: 604 cmpl $96, LTMP 605 jae .Lxts_dec_proc_6_blks_pre 606 cmpl $80, LTMP 607 jb .Lxts_dec_proc_4_blks 608 jmp .Lxts_dec_proc_5_blks 609 610.align 16 611.Lxts_dec_tail_proc: 612 cmp $0,TAILNUM 613 je .Lxts_aesdec_finish 614 vmovdqa TWEAK1,TWEAK0 // restore back tweak0 615 mov OUT,TMPOUT 616 mov IN,TMPIN 617.Lxts_dec_tail_loop: 618 sub $1,TAILNUM 619 movzb -16(TMPOUT),%r10d 620 movzb (TMPIN),%r11d 621 mov %r10b,(TMPOUT) 622 
lea 1(TMPIN),TMPIN 623 mov %r11b,-16(TMPOUT) 624 lea 1(TMPOUT),TMPOUT 625 ja .Lxts_dec_tail_loop 626 627 sub $16,OUT // step 1 block back to save the last stealing block encryption 628 add $16,LTMP 629 630 vmovdqu (OUT),BLK0 631 jmp .Lxts_dec_proc_1blk_loaded 632 633.align 16 634.Lxts_dec_last_2blks: 635 cmp $0,TAILNUM 636 je .Lxts_aesdec_finish 637 vpshufd $0x5f,TWEAK0,TWKTMP 638 vmovdqa TWEAK0,TWEAK1 // tail block use tweak0, last block use tweak1 639 NextTweakCore GFP, TWEAK0, TWKTMP, TMPX 640.Lxts_dec_proc_1_blk: 641 vmovdqu (IN),BLK0 642.Lxts_dec_proc_1blk_loaded: 643 mov KEY,KTMP 644 vpshufd $0x5f,TWEAK0,TWKTMP 645 vmovdqa TWEAK0,TWEAK5 646 movl 240(KTMP), ROUNDS 647 vmovdqu (KTMP), RDK 648 vpxor RDK,BLK0,BLK0 649 decl ROUNDS 650 vpxor TWEAK0, BLK0, BLK0 651 AES_DEC_1_BLK KTMP ROUNDS RDK BLK0 652 vpxor TWEAK0, BLK0, BLK0 653 vmovdqu BLK0, (OUT) 654 NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX 655 lea 16(IN),IN 656 subl $16,LTMP 657 lea 16(OUT),OUT 658 jl .Lxts_dec_tail_proc 659 jmp .Lxts_aesdec_start 660 661.align 16 662.Lxts_dec_proc_2_blks: 663 mov KEY,KTMP 664 vpshufd $0x5f,TWEAK0,TWKTMP 665 vmovdqa TWEAK0,TWEAK5 666 movl 240(KTMP), ROUNDS 667 vmovdqu (KTMP), RDK 668 NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX 669 vpxor (IN), RDK, BLK0 670 vpxor 16(IN), RDK, BLK1 671 decl ROUNDS 672 vpxor TWEAK0, BLK0, BLK0 673 vpxor TWEAK1, BLK1, BLK1 674 AES_DEC_2_BLKS KTMP ROUNDS RDK BLK0 BLK1 675 vpxor TWEAK0, BLK0, BLK0 676 vpxor TWEAK1, BLK1, BLK1 677 vmovdqu BLK0, (OUT) 678 vmovdqu BLK1, 16(OUT) 679 NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX 680 lea 32(IN),IN 681 subl $32,LTMP 682 lea 32(OUT),OUT 683 684 jge .Lxts_aesdec_start 685 686.align 16 687.Lxts_dec_proc_3_blks: 688 mov KEY,KTMP 689 vpshufd $0x5f,TWEAK0,TWKTMP 690 vmovdqa TWEAK0,TWEAK5 691 movl 240(KTMP), ROUNDS 692 vmovdqu (KTMP), RDK 693 NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX 694 vpxor (IN), RDK, BLK0 695 vpxor 16(IN), RDK, BLK1 696 NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX 697 vpxor 32(IN), RDK, 
BLK2 698 decl ROUNDS 699 vpxor TWEAK0, BLK0, BLK0 700 vpxor TWEAK1, BLK1, BLK1 701 vpxor TWEAK2, BLK2, BLK2 702 AES_DEC_3_BLKS KTMP ROUNDS RDK BLK0 BLK1 BLK2 703 vpxor TWEAK0, BLK0, BLK0 704 vpxor TWEAK1, BLK1, BLK1 705 vpxor TWEAK2, BLK2, BLK2 706 vmovdqu BLK0, (OUT) 707 vmovdqu BLK1, 16(OUT) 708 vmovdqu BLK2, 32(OUT) 709 NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX 710 lea 48(IN),IN 711 subl $48,LTMP 712 lea 48(OUT),OUT 713 jge .Lxts_aesdec_start 714 715.align 16 716.Lxts_dec_proc_4_blks: 717 mov KEY,KTMP 718 vpshufd $0x5f,TWEAK0,TWKTMP 719 vmovdqa TWEAK0,TWEAK5 720 movl 240(KTMP), ROUNDS 721 vmovdqu (KTMP), RDK 722 NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX 723 vpxor (IN), RDK, BLK0 724 vpxor 16(IN), RDK, BLK1 725 NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX 726 vpxor 32(IN), RDK, BLK2 727 NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX 728 vpxor 48(IN), RDK, BLK3 729 decl ROUNDS 730 vpxor TWEAK0, BLK0, BLK0 731 vpxor TWEAK1, BLK1, BLK1 732 vpxor TWEAK2, BLK2, BLK2 733 vpxor TWEAK3, BLK3, BLK3 734 AES_DEC_4_BLKS KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3 735 vpxor TWEAK0, BLK0, BLK0 736 vpxor TWEAK1, BLK1, BLK1 737 vpxor TWEAK2, BLK2, BLK2 738 vpxor TWEAK3, BLK3, BLK3 739 vmovdqu BLK0, (OUT) 740 vmovdqu BLK1, 16(OUT) 741 vmovdqu BLK2, 32(OUT) 742 vmovdqu BLK3, 48(OUT) 743 NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX 744 lea 64(IN),IN 745 subl $64,LTMP 746 lea 64(OUT),OUT 747 jge .Lxts_aesdec_start 748 749.align 16 750.Lxts_dec_proc_5_blks: 751 mov KEY,KTMP 752 vpshufd $0x5f,TWEAK0,TWKTMP 753 vmovdqa TWEAK0,TWEAK5 754 movl 240(KTMP), ROUNDS 755 vmovdqu (KTMP), RDK 756 NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX 757 vpxor (IN), RDK, BLK0 758 vpxor 16(IN), RDK, BLK1 759 NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX 760 vpxor 32(IN), RDK, BLK2 761 NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX 762 vpxor 48(IN), RDK, BLK3 763 NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX 764 vpxor 64(IN), RDK, BLK4 765 decl ROUNDS 766 vpxor TWEAK0, BLK0, BLK0 767 vpxor TWEAK1, BLK1, BLK1 768 vpxor 
TWEAK2, BLK2, BLK2 769 vpxor TWEAK3, BLK3, BLK3 770 vpxor TWEAK4, BLK4, BLK4 771 AES_DEC_5_BLKS KTMP ROUNDS RDK BLK0 BLK1 BLK2 BLK3 BLK4 772 vpxor TWEAK0, BLK0, BLK0 773 vpxor TWEAK1, BLK1, BLK1 774 vpxor TWEAK2, BLK2, BLK2 775 vpxor TWEAK3, BLK3, BLK3 776 vpxor TWEAK4, BLK4, BLK4 777 vmovdqu BLK0, (OUT) 778 vmovdqu BLK1, 16(OUT) 779 vmovdqu BLK2, 32(OUT) 780 vmovdqu BLK3, 48(OUT) 781 vmovdqu BLK4, 64(OUT) 782 NextTweak GFP, TWEAK5, TWEAK0, TWKTMP, TMPX 783 lea 80(IN),IN 784 subl $80,LTMP 785 lea 80(OUT),OUT 786 jge .Lxts_aesdec_start 787 788.align 32 789.Lxts_dec_proc_6_blks_pre: 790 vpshufd $0x5f,TWEAK0,TWKTMP // save higher doubleword of tweak 791 vmovdqa TWEAK0,TWEAK5 // copy first tweak 792 NextTweak GFP, TWEAK5, TWEAK1, TWKTMP, TMPX 793 NextTweak GFP, TWEAK5, TWEAK2, TWKTMP, TMPX 794 NextTweak GFP, TWEAK5, TWEAK3, TWKTMP, TMPX 795 NextTweak GFP, TWEAK5, TWEAK4, TWKTMP, TMPX 796 NextTweakCore GFP, TWEAK5, TWKTMP, TMPX 797.align 32 798.Lxts_dec_proc_6_blks: 799 vmovdqu (KEY), RDK 800 vmovdqu (IN),BLK0 801 vpxor TWEAK0,BLK0,BLK0 // blk0 ^= tweak0 802 vpxor RDK,BLK0,BLK0 // blk0 = blk0 ^ tweak0 ^ rk0, prepared for the loop round 803 vmovdqu -16(KEYEND),RDK1 // load last round key 804 805 vmovdqu 16(IN),BLK1 806 vpxor RDK1,TWEAK0,TWEAK0 807 aesdec 16(KEY),BLK0 // first round: rk1 808 vmovdqa TWEAK0,(%rsp) 809 vpxor TWEAK1,BLK1,BLK1 810 vpxor RDK,BLK1,BLK1 811 812 vmovdqu 32(IN),BLK2 813 vpxor RDK1,TWEAK1,TWEAK1 814 aesdec 16(KEY),BLK1 815 vmovdqa TWEAK1,16(%rsp) 816 vpxor TWEAK2,BLK2,BLK2 817 vpxor RDK,BLK2,BLK2 818 819 vmovdqu 48(IN),BLK3 820 vpxor RDK1,TWEAK2,TWEAK2 821 aesdec 16(KEY),BLK2 822 vmovdqa TWEAK2,32(%rsp) 823 vpxor TWEAK3,BLK3,BLK3 824 vpxor RDK,BLK3,BLK3 825 826 vmovdqu 64(IN),BLK4 827 vpxor RDK1,TWEAK3,TWEAK3 828 aesdec 16(KEY),BLK3 829 vmovdqa TWEAK3,48(%rsp) 830 vpxor TWEAK4,BLK4,BLK4 831 vpxor RDK,BLK4,BLK4 832 833 vmovdqu 80(IN),BLK5 834 vpxor RDK1,TWEAK4,TWEAK4 835 aesdec 16(KEY),BLK4 836 vmovdqa TWEAK4,64(%rsp) 837 vpxor TWEAK5,BLK5,BLK5 838 
vpxor RDK,BLK5,BLK5 839 vpxor RDK1,TWEAK5,TWEAK5 840 aesdec 16(KEY),BLK5 841 vmovdqa TWEAK5,80(%rsp) 842 843 mov $(7*16),TROUNDS // loop 7 rounds 844 sub ROUNDSQ,TROUNDS 845.align 32 846.Lxts_dec_6blks_loop: 847 vmovdqu -96(KEYEND,TROUNDS),RDK // left 5+1 block to interval 848 aesdec RDK, BLK0 849 aesdec RDK, BLK1 850 aesdec RDK, BLK2 851 add $16,TROUNDS 852 aesdec RDK, BLK3 853 aesdec RDK, BLK4 854 aesdec RDK, BLK5 855 jnz .Lxts_dec_6blks_loop 856 857 vpxor 80(%rsp),RDK1,TWEAK5 // tweak5 = tweak5^lastroundkey^lastroundkey 858 vmovdqu -96(KEYEND,TROUNDS),RDK 859 vpshufd $0x5f,TWEAK5,TWKTMP // use new tweak-tmp 860 vmovdqa TWKTMP,TMPX // pre-calculate next round tweak0~tweak5 861 aesdec RDK, BLK0 862 vpaddd TWKTMP,TWKTMP,TWKTMP 863 vpsrad $31,TMPX,TMPX 864 aesdec RDK, BLK1 865 vpaddq TWEAK5,TWEAK5,TWEAK5 866 vpand GFP,TMPX,TMPX 867 aesdec RDK, BLK2 868 vpxor TMPX,TWEAK5,TWEAK5 869 add $16,TROUNDS 870 aesdec RDK, BLK3 871 vmovdqa TWEAK5,TWEAK0 872 aesdec RDK, BLK4 873 aesdec RDK, BLK5 874 875 vmovdqu -96(KEYEND,TROUNDS),RDK 876 vmovdqa TWKTMP,TMPX 877 aesdec RDK, BLK0 878 vpaddd TWKTMP,TWKTMP,TWKTMP 879 vpsrad $31,TMPX,TMPX 880 aesdec RDK, BLK1 881 vpaddq TWEAK5,TWEAK5,TWEAK5 882 vpand GFP,TMPX,TMPX 883 aesdec RDK, BLK2 884 vpxor TMPX,TWEAK5,TWEAK5 885 add $16,TROUNDS 886 aesdec RDK, BLK3 887 vmovdqa TWEAK5,TWEAK1 888 aesdec RDK, BLK4 889 aesdec RDK, BLK5 890 891 vmovdqu -96(KEYEND,TROUNDS),RDK 892 vmovdqa TWKTMP,TMPX 893 aesdec RDK, BLK0 894 vpaddd TWKTMP,TWKTMP,TWKTMP 895 vpsrad $31,TMPX,TMPX 896 aesdec RDK, BLK1 897 vpaddq TWEAK5,TWEAK5,TWEAK5 898 vpand GFP,TMPX,TMPX 899 aesdec RDK, BLK2 900 vpxor TMPX,TWEAK5,TWEAK5 901 add $16,TROUNDS 902 aesdec RDK, BLK3 903 vmovdqa TWEAK5,TWEAK2 904 aesdec RDK, BLK4 905 aesdec RDK, BLK5 906 907 vmovdqu -96(KEYEND,TROUNDS),RDK 908 vmovdqa TWKTMP,TMPX 909 aesdec RDK, BLK0 910 vpaddd TWKTMP,TWKTMP,TWKTMP 911 vpsrad $31,TMPX,TMPX 912 aesdec RDK, BLK1 913 vpaddq TWEAK5,TWEAK5,TWEAK5 914 vpand GFP,TMPX,TMPX 915 aesdec RDK, BLK2 916 
vpxor TMPX,TWEAK5,TWEAK5 917 add $16,TROUNDS 918 aesdec RDK, BLK3 919 vmovdqa TWEAK5,TWEAK3 920 aesdec RDK, BLK4 921 aesdec RDK, BLK5 922 923 vmovdqu -96(KEYEND,TROUNDS),RDK 924 vmovdqa TWKTMP,TMPX 925 aesdec RDK, BLK0 926 vpaddd TWKTMP,TWKTMP,TWKTMP 927 vpsrad $31,TMPX,TMPX 928 aesdec RDK, BLK1 929 vpaddq TWEAK5,TWEAK5,TWEAK5 930 vpand GFP,TMPX,TMPX 931 aesdec RDK, BLK2 932 vpxor TMPX,TWEAK5,TWEAK5 933 aesdec RDK, BLK3 934 vmovdqa TWEAK5,TWEAK4 935 aesdec RDK, BLK4 936 aesdec RDK, BLK5 937 938 vmovdqa TWKTMP,TMPX 939 aesdeclast (%rsp), BLK0 940 aesdeclast 16(%rsp), BLK1 // already do the tweak^lastround, so here just aesdeclast 941 vpaddd TWKTMP,TWKTMP,TWKTMP 942 vpsrad $31,TMPX,TMPX 943 aesdeclast 32(%rsp), BLK2 944 vpaddq TWEAK5,TWEAK5,TWEAK5 945 vpand GFP,TMPX,TMPX 946 aesdeclast 48(%rsp), BLK3 947 vpxor TMPX,TWEAK5,TWEAK5 948 aesdeclast 64(%rsp), BLK4 949 aesdeclast 80(%rsp), BLK5 950 951 vmovdqu BLK0, (OUT) 952 vmovdqu BLK1, 16(OUT) 953 vmovdqu BLK2, 32(OUT) 954 vmovdqu BLK3, 48(OUT) 955 vmovdqu BLK4, 64(OUT) 956 vmovdqu BLK5, 80(OUT) 957 958 leaq 96(IN), IN 959 leaq 96(OUT), OUT 960 sub $96, LTMP 961 cmp $96, LTMP 962 jb .Lxts_aesdec_start 963 jmp .Lxts_dec_proc_6_blks 964 965.align 16 966.Lxts_aesdec_finish: 967 vmovdqu TWEAK0, (TWEAK) 968 vpxor BLK0, BLK0, BLK0 969 vpxor BLK1, BLK1, BLK1 970 vpxor BLK2, BLK2, BLK2 971 vpxor BLK3, BLK3, BLK3 972 vpxor BLK4, BLK4, BLK4 973 vpxor BLK5, BLK5, BLK5 974 vpxor BLK6, BLK6, BLK6 975 vpxor RDK, RDK, RDK 976 movl $0, RET 977 978 mov %rbp,%rsp 979 add $96,%rsp 980 popq %r15 981 popq %r14 982 popq %r13 983 popq %r12 984 popq %rbp 985 popq %rbx 986 ret 987.cfi_endproc 988.size CRYPT_AES_XTS_Decrypt, .-CRYPT_AES_XTS_Decrypt 989 990#endif 991