/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/inst.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values, for FP
 * use movaps (move aligned packed single) or for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.  However,
 * the movaps is a byte shorter, so that is the one we'll use for now.
 * (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

.data
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
POLY:   .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

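# POLY is the GHASH polynomial x^128 + x^127 + x^126 + x^121 + 1 in the
# bit-reflected representation used by this code (see the "poly =" note in
# the aesni_gcm_enc/dec comments below); together with TWOONE it is used
# when HashKey<<1 is reduced at the start of aesni_gcm_enc/dec.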
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and ZERO should follow ALL_F

SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2


.text


#define	STACK_OFFSET    8*3
#define	HashKey		16*0	// store HashKey <<1 mod poly here
#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
#define	VARIABLE_OFFSET	16*8

#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif


#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
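/*
 * Note: GHASH_MUL below uses one level of Karatsuba.  Splitting the 128-bit
 * operands into 64-bit halves A = a1:a0 and B = b1:b0, the carry-less
 * product is
 *
 *	A*B = (a1*b1)<<128 ^ ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)<<64 ^ (a0*b0)
 *
 * so only three PCLMULQDQs are needed; the 256-bit result is then reduced
 * modulo the polynomial in the two shift/xor phases that follow.
 */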
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a1*b0)+(a0*b1)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2		# packed left shift <<31
	pslld     $30, \TMP3		# packed left shift <<30
	pslld     $25, \TMP4		# packed left shift <<25
	pxor      \TMP3, \TMP2		# xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5		# right shift TMP5 1 DW
	pslldq    $12, \TMP2		# left shift TMP2 3 DWs
	pxor      \TMP2, \GH

	# second phase of the reduction

	movdqa    \GH,\TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2		# packed right shift >>1
	psrld     $2,\TMP3		# packed right shift >>2
	psrld     $7,\TMP4		# packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH		# result is in GH
.endm

/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i

_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation

	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation

	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation

_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   (%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0			# INCR Y0
	movdqa	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index		# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	add	   $5,%eax			# 128->9, 192->11, 256->13

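	# keysize holds the AES key length in bytes (16/24/32), so
	# keysize/4 + 5 = 9/11/13 is the number of full AESENC rounds;
	# the final round is handled below with AESENCLAST.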
aes_loop_initial_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_dec\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index		# Last Round
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11

	movdqa	   \TMP1, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index
	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	   ONE(%rip), \TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_dec_done\num_initial_blocks
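	# AES-128 needs no extra rounds here; AES-192/AES-256 run 2/4
	# additional AESENC iterations before the last round.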

aes_loop_pre_dec\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_dec\num_initial_blocks

aes_loop_pre_dec_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm


/*
* if a = number of total plaintext bytes
* b = floor(a/16)
* num_initial_blocks = b mod 4
* encrypt the initial num_initial_blocks blocks and apply ghash on
* the ciphertext
* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* are clobbered
* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
*/


.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1
	pslldq	   $12, \TMP1
	psrldq	   $4, %xmm\i
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11		# initialise the data pointer offset as zero

	# start AES for num_initial_blocks blocks

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	   ONE(%RIP),\TMP1
	MOVADQ	   0(%arg1),\TMP2
.irpc index, \i_seq
	paddd	   \TMP1, \XMM0			# INCR Y0
	MOVADQ	   \XMM0, %xmm\index
	PSHUFB_XMM %xmm14, %xmm\index		# perform a 16 byte swap
	pxor	   \TMP2, %xmm\index
.endr
	lea	   0x10(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax			# 128->4, 192->6, 256->8
	add	   $5,%eax			# 128->9, 192->11, 256->13

aes_loop_initial_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP1
.irpc index, \i_seq
	AESENC	   \TMP1, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_initial_enc\num_initial_blocks

	MOVADQ	   (%r10), \TMP1
.irpc index, \i_seq
	AESENCLAST \TMP1, %xmm\index		# Last Round
.endr
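	# XOR the encrypted counter blocks with the input to produce the
	# ciphertext, then byte-reflect the ciphertext for the GHASH updates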
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	add	   $16, %r11
	PSHUFB_XMM %xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	# apply GHASH on num_initial_blocks blocks

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	cmp	   $64, %r13
	jl	   _initial_blocks_done\num_initial_blocks\operation
	# no need for precomputed values
/*
*
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
*/
	MOVADQ	   ONE(%RIP),\TMP1
	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM1
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM2
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM3
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   \TMP1, \XMM0		# INCR Y0
	MOVADQ	   \XMM0, \XMM4
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	MOVADQ	   0(%arg1),\TMP1
	pxor	   \TMP1, \XMM1
	pxor	   \TMP1, \XMM2
	pxor	   \TMP1, \XMM3
	pxor	   \TMP1, \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqa	   \TMP5, HashKey_2(%rsp)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	lea	   0xa0(%arg1),%r10
	mov	   keysize,%eax
	shr	   $2,%eax		# 128->4, 192->6, 256->8
	sub	   $4,%eax		# 128->0, 192->2, 256->4
	jz	   aes_loop_pre_enc_done\num_initial_blocks

aes_loop_pre_enc\num_initial_blocks:
	MOVADQ	   (%r10),\TMP2
.irpc index, 1234
	AESENC	   \TMP2, %xmm\index
.endr
	add	   $16,%r10
	sub	   $1,%eax
	jnz	   aes_loop_pre_enc\num_initial_blocks

aes_loop_pre_enc_done\num_initial_blocks:
	MOVADQ	   (%r10), \TMP2
	AESENCLAST \TMP2, \XMM1
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm

/*
* encrypt 4 blocks at a time
* ghash the 4 previously encrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

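	# (the AES rounds are interleaved with these carry-less multiplies,
	# presumably so that AESENC and PCLMULQDQ latencies can overlap)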
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_enc_done

aes_loop_par_enc:
	MOVADQ	  (%r10),\TMP3
.irpc index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_enc

aes_loop_par_enc_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# Round 10
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to the ciphertext buffer
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to the ciphertext buffer
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to the ciphertext buffer
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

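	# as in GHASH_MUL, the two shift/xor phases plus the final xor with
	# TMP1 reduce the 256-bit product TMP1:XMM5 modulo the GHASH
	# polynomial; the reduced digest ends up in XMM5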
	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/*
* decrypt 4 blocks at a time
* ghash the 4 previously decrypted ciphertext blocks
* arg1, %arg2, %arg3 are used as pointers only, not modified
* %r11 is the data offset value
*/
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using karatsuba

	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6
	pxor	  \XMM5, \TMP6
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		# INCR CNT
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PCLMULQDQ 0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 1
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			# Round 2
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 3
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 4
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 5
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5

	# Multiply TMP5 * HashKey using karatsuba

	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 6
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 7
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 8
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	pxor	  \TMP1, \TMP4
# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			# Round 9
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	  0xa0(%arg1),%r10
	mov	  keysize,%eax
	shr	  $2,%eax			# 128->4, 192->6, 256->8
	sub	  $4,%eax			# 128->0, 192->2, 256->4
	jz	  aes_loop_par_dec_done

aes_loop_par_dec:
	MOVADQ	  (%r10),\TMP3
.irpc index, 1234
	AESENC	  \TMP3, %xmm\index
.endr
	add	  $16,%r10
	sub	  $1,%eax
	jnz	  aes_loop_par_dec

aes_loop_par_dec_done:
	MOVADQ	  (%r10), \TMP3
	AESENCLAST \TMP3, \XMM1			# last round
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM1, (%arg2,%r11,1)		# Write to plaintext buffer
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM2, 16(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM3, 32(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	  \XMM4, 48(%arg2,%r11,1)	# Write to plaintext buffer
	movdqa	  \TMP3, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM2		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM3		# perform a 16 byte swap
	PSHUFB_XMM %xmm15, \XMM4		# perform a 16 byte swap

	pxor	  \TMP4, \TMP1
	pxor	  \XMM8, \XMM5
	pxor	  \TMP6, \TMP2
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	  $31, \TMP2			# packed left shift << 31
	pslld	  $30, \TMP3			# packed left shift << 30
	pslld	  $25, \TMP4			# packed left shift << 25
	pxor	  \TMP3, \TMP2			# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			# right shift T5 1 DW
	pslldq	  $12, \TMP2			# left shift T2 3 DWs
	pxor	  \TMP2, \XMM5

	# second phase of reduction

	movdqa	  \XMM5,\TMP2		# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	movdqa	  \XMM5,\TMP3
	movdqa	  \XMM5,\TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3,\TMP2		# xor the shifted versions
	pxor	  \TMP4,\TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5		# result is in XMM5

	pxor	  \XMM5, \XMM1
.endm

/* GHASH the last 4 ciphertext blocks. */
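/*
 * On entry XMM1..XMM4 hold the byte-reflected blocks (the callers have
 * already folded the running digest into XMM1); the macro computes
 *
 *	XMMDst = XMM1*H^4 ^ XMM2*H^3 ^ XMM3*H^2 ^ XMM4*H
 *
 * accumulating the Karatsuba halves of all four products and performing a
 * single reduction at the end.
 */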
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	PCLMULQDQ 0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	# middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4		# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	  $31, \TMP2		# packed left shifting << 31
	pslld	  $30, \TMP3		# packed left shifting << 30
	pslld	  $25, \TMP4		# packed left shifting << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7		# right shift TMP7 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \XMMDst

	# second phase of the reduction
	movdqa	  \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2		# packed right shift >> 1
	psrld	  $2, \TMP3		# packed right shift >> 2
	psrld	  $7, \TMP4		# packed right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst	# reduced result is in XMMDst
.endm


/* Encryption of a single block
* uses eax & r10
*/

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0
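	# round 0: whitening xor with the first round key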
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	AESENC		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	AESENCLAST	\TMP1,\XMM0
.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
ENTRY(aesni_gcm_dec)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
/*
* states of %xmm registers %xmm6:%xmm15 not saved
* all %xmm registers are clobbered
*/
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13			# %xmm13 = HashKey
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# Reduction

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds the HashKey<<1 (mod poly)


	# Decrypt first few blocks

	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_decrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_decrypt
	je	_initial_num_blocks_is_2_decrypt
_initial_num_blocks_is_3_decrypt:
	INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
	sub	$48, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_2_decrypt:
	INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
	sub	$32, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_1_decrypt:
	INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
	sub	$16, %r13
	jmp	_initial_blocks_decrypted
_initial_num_blocks_is_0_decrypt:
	INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
_initial_blocks_decrypted:
	cmp	$0, %r13
	je	_zero_cipher_left_decrypt
	sub	$64, %r13
	je	_four_cipher_left_decrypt
_decrypt_by_4:
	GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
	add	$64, %r11
	sub	$64, %r13
	jne	_decrypt_by_4
_four_cipher_left_decrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_decrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_decrypt

	# Handle the last <16 byte block separately

	paddd	ONE(%rip), %xmm0		# increment CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

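	# Encrypt Yn to get the last keystream block; the partial ciphertext
	# block is shifted into place via SHIFT_MASK, xor'ed with it, and the
	# unused bytes are cleared with the ALL_F-derived mask before the
	# plaintext is written out and the ciphertext is fed to GHASH.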
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1		# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2			# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1			# right shift 16-%r13 bytes

	movdqa	%xmm1, %xmm2
	pxor	%xmm1, %xmm0			# Ciphertext XOR E(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm0			# mask out top 16-%r13 bytes of %xmm0
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm2

	pxor	%xmm2, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	# output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_decrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_decrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_decrypt
_multiple_of_16_bytes_decrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*128)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_decrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_decrypt
	cmp	$12, %r11
	je	_T_12_decrypt
_T_8_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_decrypt
_T_12_decrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_decrypt
_T_16_decrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_decrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp
	mov	%arg6, %r12
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13


# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13
	psrlq	$63, %xmm2
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2
	psrldq	$8, %xmm1
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13
	movdqa	%xmm13, HashKey(%rsp)
	mov	%arg4, %r13		# %xmm13 holds HashKey<<1 (mod poly)
	and	$-16, %r13
	mov	%r13, %r12

	# Encrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13			# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0


	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)
	sub	$16, %r11
	add	%r13, %r11
	movdqu	(%arg3,%r11,1), %xmm1		# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2			# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1			# shift right 16-r13 bytes
	pxor	%xmm1, %xmm0			# Plaintext XOR Encrypt(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11

	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2 , %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (*128)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15	# Encrypt(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_encrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$12, %r11
	je	_T_12_encrypt
_T_8_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_encrypt
_T_12_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_enc)

#endif


.align 4
_key_expansion_128:
_key_expansion_256a:
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	ret
ENDPROC(_key_expansion_128)
ENDPROC(_key_expansion_256a)

.align 4
_key_expansion_192a:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	ret
ENDPROC(_key_expansion_192a)

.align 4
_key_expansion_192b:
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
.align 4
_key_expansion_192b:
        pshufd $0b01010101, %xmm1, %xmm1
        shufps $0b00010000, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        shufps $0b10001100, %xmm0, %xmm4
        pxor %xmm4, %xmm0
        pxor %xmm1, %xmm0

        movaps %xmm2, %xmm5
        pslldq $4, %xmm5
        pshufd $0b11111111, %xmm0, %xmm3
        pxor %xmm3, %xmm2
        pxor %xmm5, %xmm2

        movaps %xmm0, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_192b)

.align 4
_key_expansion_256b:
        pshufd $0b10101010, %xmm1, %xmm1
        shufps $0b00010000, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        shufps $0b10001100, %xmm2, %xmm4
        pxor %xmm4, %xmm2
        pxor %xmm1, %xmm2
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        ret
ENDPROC(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 */
ENTRY(aesni_set_key)
#ifndef __x86_64__
        pushl KEYP
        movl 8(%esp), KEYP              # ctx
        movl 12(%esp), UKEYP            # in_key
        movl 16(%esp), %edx             # key_len
#endif
        movups (UKEYP), %xmm0           # user key (first 16 bytes)
        movaps %xmm0, (KEYP)
        lea 0x10(KEYP), TKEYP           # key addr
        movl %edx, 480(KEYP)
        pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
        cmp $24, %dl
        jb .Lenc_key128
        je .Lenc_key192
        movups 0x10(UKEYP), %xmm2       # other user key
        movaps %xmm2, (TKEYP)
        add $0x10, TKEYP
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_256a
        AESKEYGENASSIST 0x1 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_256a
        AESKEYGENASSIST 0x2 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_256a
        AESKEYGENASSIST 0x4 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_256a
        AESKEYGENASSIST 0x8 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_256a
        AESKEYGENASSIST 0x10 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_256a
        AESKEYGENASSIST 0x20 %xmm0 %xmm1
        call _key_expansion_256b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_256a
        jmp .Ldec_key
.Lenc_key192:
        movq 0x10(UKEYP), %xmm2         # other user key
        AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
        call _key_expansion_192a
        AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
        call _key_expansion_192b
        AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
        call _key_expansion_192a
        AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
        call _key_expansion_192b
        AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
        call _key_expansion_192a
        AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
        call _key_expansion_192b
        AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
        call _key_expansion_192a
        AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
        call _key_expansion_192b
        jmp .Ldec_key
.Lenc_key128:
        AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
        call _key_expansion_128
        AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
        call _key_expansion_128
        AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
        call _key_expansion_128
        AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
        call _key_expansion_128
        AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
        call _key_expansion_128
        AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
        call _key_expansion_128
        AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
        call _key_expansion_128
        AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
        call _key_expansion_128
        AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
        call _key_expansion_128
        AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
        call _key_expansion_128
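# Build the decryption key schedule in the second half of the context
# (offset 240): the encryption round keys are copied in reverse order,
# and all but the first and last are run through AESIMC (InvMixColumns),
# as required by the equivalent inverse cipher.  Roughly:
#
#       dec[0]       = enc[nrounds];
#       dec[nrounds] = enc[0];
#       for (i = 1; i < nrounds; i++)
#               dec[nrounds - i] = aesimc(enc[i]);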
.Ldec_key:
        sub $0x10, TKEYP
        movaps (KEYP), %xmm0
        movaps (TKEYP), %xmm1
        movaps %xmm0, 240(TKEYP)
        movaps %xmm1, 240(KEYP)
        add $0x10, KEYP
        lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
        movaps (KEYP), %xmm0
        AESIMC %xmm0 %xmm1
        movaps %xmm1, (UKEYP)
        add $0x10, KEYP
        sub $0x10, UKEYP
        cmp TKEYP, KEYP
        jb .Ldec_key_loop
        xor AREG, AREG
#ifndef __x86_64__
        popl KEYP
#endif
        ret
ENDPROC(aesni_set_key)

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
        pushl KEYP
        pushl KLEN
        movl 12(%esp), KEYP
        movl 16(%esp), OUTP
        movl 20(%esp), INP
#endif
        movl 480(KEYP), KLEN            # key length
        movups (INP), STATE             # input
        call _aesni_enc1
        movups STATE, (OUTP)            # output
#ifndef __x86_64__
        popl KLEN
        popl KEYP
#endif
        ret
ENDPROC(aesni_enc)

/*
 * _aesni_enc1: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE                 # round 0
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .Lenc128
        lea 0x20(TKEYP), TKEYP
        je .Lenc192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x50(TKEYP), KEY
        AESENC KEY STATE
.align 4
.Lenc192:
        movaps -0x40(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x30(TKEYP), KEY
        AESENC KEY STATE
.align 4
.Lenc128:
        movaps -0x20(TKEYP), KEY
        AESENC KEY STATE
        movaps -0x10(TKEYP), KEY
        AESENC KEY STATE
        movaps (TKEYP), KEY
        AESENC KEY STATE
        movaps 0x10(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x20(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x30(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x40(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x50(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x60(TKEYP), KEY
        AESENC KEY STATE
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE
        ret
ENDPROC(_aesni_enc1)

/*
 * _aesni_enc4: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_enc4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE1                # round 0
        pxor KEY, STATE2
        pxor KEY, STATE3
        pxor KEY, STATE4
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .L4enc128
        lea 0x20(TKEYP), TKEYP
        je .L4enc192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x50(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
#.align 4
.L4enc192:
        movaps -0x40(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x30(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
#.align 4
.L4enc128:
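        # Rounds shared by all key sizes.  TKEYP was advanced by 0x30, 0x50
        # or 0x70 above, so the last round key is always at 0x70(TKEYP) and
        # the nine AESENC rounds below plus the final AESENCLAST finish the
        # 128-, 192- and 256-bit schedules alike.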
        movaps -0x20(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps -0x10(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps (TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x10(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x20(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x30(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x40(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x50(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x60(TKEYP), KEY
        AESENC KEY STATE1
        AESENC KEY STATE2
        AESENC KEY STATE3
        AESENC KEY STATE4
        movaps 0x70(TKEYP), KEY
        AESENCLAST KEY STATE1           # last round
        AESENCLAST KEY STATE2
        AESENCLAST KEY STATE3
        AESENCLAST KEY STATE4
        ret
ENDPROC(_aesni_enc4)

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
        pushl KEYP
        pushl KLEN
        movl 12(%esp), KEYP
        movl 16(%esp), OUTP
        movl 20(%esp), INP
#endif
        mov 480(KEYP), KLEN             # key length
        add $240, KEYP
        movups (INP), STATE             # input
        call _aesni_dec1
        movups STATE, (OUTP)            # output
#ifndef __x86_64__
        popl KLEN
        popl KEYP
#endif
        ret
ENDPROC(aesni_dec)

/*
 * _aesni_dec1: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE:          initial state (input)
 * output:
 *      STATE:          final state (output)
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec1:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE                 # round 0
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .Ldec128
        lea 0x20(TKEYP), TKEYP
        je .Ldec192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x50(TKEYP), KEY
        AESDEC KEY STATE
.align 4
.Ldec192:
        movaps -0x40(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x30(TKEYP), KEY
        AESDEC KEY STATE
.align 4
.Ldec128:
        movaps -0x20(TKEYP), KEY
        AESDEC KEY STATE
        movaps -0x10(TKEYP), KEY
        AESDEC KEY STATE
        movaps (TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x10(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x20(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x30(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x40(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x50(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x60(TKEYP), KEY
        AESDEC KEY STATE
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE
        ret
ENDPROC(_aesni_dec1)

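# Like _aesni_enc4, the four-block variant below keeps four independent
# states in flight so that the latency of each AESDEC round can be hidden
# behind work on the other blocks.  It is used by the ECB and CBC
# decryption paths, where ciphertext blocks can be processed independently.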
/*
 * _aesni_dec4: internal ABI
 * input:
 *      KEYP:           key struct pointer
 *      KLEN:           key length
 *      STATE1:         initial state (input)
 *      STATE2
 *      STATE3
 *      STATE4
 * output:
 *      STATE1:         final state (output)
 *      STATE2
 *      STATE3
 *      STATE4
 * changed:
 *      KEY
 *      TKEYP (T1)
 */
.align 4
_aesni_dec4:
        movaps (KEYP), KEY              # key
        mov KEYP, TKEYP
        pxor KEY, STATE1                # round 0
        pxor KEY, STATE2
        pxor KEY, STATE3
        pxor KEY, STATE4
        add $0x30, TKEYP
        cmp $24, KLEN
        jb .L4dec128
        lea 0x20(TKEYP), TKEYP
        je .L4dec192
        add $0x20, TKEYP
        movaps -0x60(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x50(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
.align 4
.L4dec192:
        movaps -0x40(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x30(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
.align 4
.L4dec128:
        movaps -0x20(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps -0x10(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps (TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x10(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x20(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x30(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x40(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x50(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x60(TKEYP), KEY
        AESDEC KEY STATE1
        AESDEC KEY STATE2
        AESDEC KEY STATE3
        AESDEC KEY STATE4
        movaps 0x70(TKEYP), KEY
        AESDECLAST KEY STATE1           # last round
        AESDECLAST KEY STATE2
        AESDECLAST KEY STATE3
        AESDECLAST KEY STATE4
        ret
ENDPROC(_aesni_dec4)

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 16(%esp), KEYP
        movl 20(%esp), OUTP
        movl 24(%esp), INP
        movl 28(%esp), LEN
#endif
        test LEN, LEN                   # check length
        jz .Lecb_enc_ret
        mov 480(KEYP), KLEN
        cmp $16, LEN
        jb .Lecb_enc_ret
        cmp $64, LEN
        jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        call _aesni_enc4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lecb_enc_loop4
        cmp $16, LEN
        jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
        movups (INP), STATE1
        call _aesni_enc1
        movups STATE1, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
#endif
        ret
ENDPROC(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len);
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 16(%esp), KEYP
        movl 20(%esp), OUTP
        movl 24(%esp), INP
        movl 28(%esp), LEN
#endif
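        # Decrypt with the inverse key schedule (ctx + 240): four blocks per
        # iteration while at least 64 bytes remain, then one block at a time;
        # any trailing partial block (< 16 bytes) is left untouched.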
        test LEN, LEN
        jz .Lecb_dec_ret
        mov 480(KEYP), KLEN
        add $240, KEYP
        cmp $16, LEN
        jb .Lecb_dec_ret
        cmp $64, LEN
        jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
        movups (INP), STATE1
        movups 0x10(INP), STATE2
        movups 0x20(INP), STATE3
        movups 0x30(INP), STATE4
        call _aesni_dec4
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lecb_dec_loop4
        cmp $16, LEN
        jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
        movups (INP), STATE1
        call _aesni_dec1
        movups STATE1, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
#endif
        ret
ENDPROC(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 20(%esp), KEYP
        movl 24(%esp), OUTP
        movl 28(%esp), INP
        movl 32(%esp), LEN
        movl 36(%esp), IVP
#endif
        cmp $16, LEN
        jb .Lcbc_enc_ret
        mov 480(KEYP), KLEN
        movups (IVP), STATE             # load iv as initial state
.align 4
.Lcbc_enc_loop:
        movups (INP), IN                # load input
        pxor IN, STATE
        call _aesni_enc1
        movups STATE, (OUTP)            # store output
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lcbc_enc_loop
        movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        ret
ENDPROC(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        movl 20(%esp), KEYP
        movl 24(%esp), OUTP
        movl 28(%esp), INP
        movl 32(%esp), LEN
        movl 36(%esp), IVP
#endif
        cmp $16, LEN
        jb .Lcbc_dec_just_ret
        mov 480(KEYP), KLEN
        add $240, KEYP
        movups (IVP), IV
        cmp $64, LEN
        jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
        movups (INP), IN1
        movaps IN1, STATE1
        movups 0x10(INP), IN2
        movaps IN2, STATE2
#ifdef __x86_64__
        movups 0x20(INP), IN3
        movaps IN3, STATE3
        movups 0x30(INP), IN4
        movaps IN4, STATE4
#else
        movups 0x20(INP), IN1
        movaps IN1, STATE3
        movups 0x30(INP), IN2
        movaps IN2, STATE4
#endif
        call _aesni_dec4
        pxor IV, STATE1
#ifdef __x86_64__
        pxor IN1, STATE2
        pxor IN2, STATE3
        pxor IN3, STATE4
        movaps IN4, IV
#else
        pxor IN1, STATE4
        movaps IN2, IV
        movups (INP), IN1
        pxor IN1, STATE2
        movups 0x10(INP), IN2
        pxor IN2, STATE3
#endif
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lcbc_dec_loop4
        cmp $16, LEN
        jb .Lcbc_dec_ret
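        # Single-block tail: P(i) = Decrypt(K, C(i)) XOR C(i-1), with the
        # previous ciphertext block carried in IV.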
.align 4
.Lcbc_dec_loop1:
        movups (INP), IN
        movaps IN, STATE
        call _aesni_dec1
        pxor IV, STATE
        movups STATE, (OUTP)
        movaps IN, IV
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
        movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        ret
ENDPROC(aesni_cbc_dec)

#ifdef __x86_64__
.align 16
.Lbswap_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *      IV
 * output:
 *      CTR:            == IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 *      INC:            == 1, in little endian
 *      BSWAP_MASK      == endian swapping mask
 */
.align 4
_aesni_inc_init:
        movaps .Lbswap_mask, BSWAP_MASK
        movaps IV, CTR
        PSHUFB_XMM BSWAP_MASK CTR
        mov $1, TCTR_LOW
        MOVQ_R64_XMM TCTR_LOW INC
        MOVQ_R64_XMM CTR TCTR_LOW
        ret
ENDPROC(_aesni_inc_init)

/*
 * _aesni_inc: internal ABI
 * Increase IV by 1, IV is in big endian
 * input:
 *      IV
 *      CTR:            == IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 *      INC:            == 1, in little endian
 *      BSWAP_MASK      == endian swapping mask
 * output:
 *      IV:             increased by 1
 * changed:
 *      CTR:            == output IV, in little endian
 *      TCTR_LOW:       == lower qword of CTR
 */
.align 4
_aesni_inc:
        paddq INC, CTR
        add $1, TCTR_LOW
        jnc .Linc_low
        pslldq $8, INC
        paddq INC, CTR
        psrldq $8, INC
.Linc_low:
        movaps CTR, IV
        PSHUFB_XMM BSWAP_MASK IV
        ret
ENDPROC(_aesni_inc)

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
        cmp $16, LEN
        jb .Lctr_enc_just_ret
        mov 480(KEYP), KLEN
        movups (IVP), IV
        call _aesni_inc_init
        cmp $64, LEN
        jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
        movaps IV, STATE1
        call _aesni_inc
        movups (INP), IN1
        movaps IV, STATE2
        call _aesni_inc
        movups 0x10(INP), IN2
        movaps IV, STATE3
        call _aesni_inc
        movups 0x20(INP), IN3
        movaps IV, STATE4
        call _aesni_inc
        movups 0x30(INP), IN4
        call _aesni_enc4
        pxor IN1, STATE1
        movups STATE1, (OUTP)
        pxor IN2, STATE2
        movups STATE2, 0x10(OUTP)
        pxor IN3, STATE3
        movups STATE3, 0x20(OUTP)
        pxor IN4, STATE4
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jge .Lctr_enc_loop4
        cmp $16, LEN
        jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
        movaps IV, STATE
        call _aesni_inc
        movups (INP), IN
        call _aesni_enc1
        pxor IN, STATE
        movups STATE, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jge .Lctr_enc_loop1
.Lctr_enc_ret:
        movups IV, (IVP)
.Lctr_enc_just_ret:
        ret
ENDPROC(aesni_ctr_enc)

/*
 * _aesni_gf128mul_x_ble: internal ABI
 * Multiply in GF(2^128) for XTS IVs
 * input:
 *      IV:     current IV
 *      GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *      IV:     next IV
 * changed:
 *      CTR:    == temporary value
 */
#define _aesni_gf128mul_x_ble() \
        pshufd $0x13, IV, CTR; \
        paddq IV, IV; \
        psrad $31, CTR; \
        pand GF128MUL_MASK, CTR; \
        pxor CTR, IV;
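# Roughly the following scalar operation, sketched here for reference
# (IV is one 128-bit tweak, treated as a little-endian number):
#
#       carry = IV >> 127;              /* bit shifted out by doubling  */
#       IV  <<= 1;
#       if (carry)
#               IV ^= 0x87;             /* reduce by the XTS polynomial */
#
# pshufd+psrad broadcast the relevant sign bits so that the pand with
# .Lgf128mul_x_ble_mask injects 0x87 into the low qword (the reduction)
# and 0x01 into the high qword (the carry that paddq cannot propagate
# across the 64-bit lanes).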
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                       bool enc, u8 *iv)
 */
ENTRY(aesni_xts_crypt8)
        cmpb $0, %cl
        movl $0, %ecx
        movl $240, %r10d
        leaq _aesni_enc4, %r11
        leaq _aesni_dec4, %rax
        cmovel %r10d, %ecx
        cmoveq %rax, %r11

        movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
        movups (IVP), IV

        mov 480(KEYP), KLEN
        addq %rcx, KEYP

        movdqa IV, STATE1
        movdqu 0x00(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x10(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x20(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x30(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x30(OUTP)

        CALL_NOSPEC %r11

        movdqu 0x00(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE1
        movdqu 0x40(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x40(OUTP)

        movdqu 0x10(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x50(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x50(OUTP)

        movdqu 0x20(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x60(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x60(OUTP)

        movdqu 0x30(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x30(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x70(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x70(OUTP)

        _aesni_gf128mul_x_ble()
        movups IV, (IVP)

        CALL_NOSPEC %r11

        movdqu 0x40(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x40(OUTP)

        movdqu 0x50(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x50(OUTP)

        movdqu 0x60(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x60(OUTP)

        movdqu 0x70(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x70(OUTP)

        ret
ENDPROC(aesni_xts_crypt8)

#endif