/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is an optimized AES128/192/256 CTR mode implementation. It requires
 * support for the Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in the Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>

#define VMOVDQ		vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register". The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */
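
/*
 * What each 16-byte block computes, as a C-like reference sketch
 * (illustrative only; aes_encrypt_block() is a hypothetical one-block AES
 * primitive, and be128()/le128() denote 128-bit big/little-endian values):
 *
 *	CTR:  out[i] = in[i] ^ aes_encrypt_block(key, be128(iv) + i)
 *	XCTR: out[i] = in[i] ^ aes_encrypt_block(key, iv ^ le128(byte_ctr/16 + i + 1))
 *
 * CTR treats the IV as a big-endian 128-bit counter (hence the byte
 * swapping below), while XCTR XORs a little-endian block number into the
 * IV and needs no swapping. "by8" means up to eight such blocks are kept
 * in flight at once to hide the vaesenc latency.
 */
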
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8	// CTR mode only
#define xiv		%xmm8	// XCTR mode only
#define xbyteswap	%xmm9	// CTR mode only
#define xtmp		%xmm9	// XCTR mode only
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define counter		%r9	// XCTR mode only
#define tmp		%r10
#define DDQ_DATA	0
#define XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* generate a unique alias for an xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/* concatenate the numeric 'id' onto the symbol 'name' */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len xctr
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len, xctr
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	.if \xctr
		movq	counter, xtmp
		.set i, 0
		.rept (by)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
			.set i, (i + 1)
		.endr
		.set i, 0
		.rept (by)
			club XDATA, i
			vpxor	xiv, var_xdata, var_xdata
			.set i, (i + 1)
		.endr
	.else
		vpshufb	xbyteswap, xcounter, xdata0
		.set i, 1
		.rept (by - 1)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
			vptest	ddq_low_msk(%rip), var_xdata
			jnz	1f
			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
			vpshufb	xbyteswap, var_xdata, var_xdata
			.set i, (i + 1)
		.endr
	.endif
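
	/*
	 * Note on the carry handling above: vpaddq adds only into the low
	 * 64 bits of the 128-bit counter, so a wrap must be propagated by
	 * hand. vptest sets ZF when (var_xdata & ddq_low_msk) == 0, i.e.
	 * exactly when the low qword has just wrapped to zero; in that case
	 * ddq_high_add_1 adds the carry into the high qword of both the new
	 * counter block and xcounter itself, so the remaining blocks of the
	 * batch inherit the carry. For example, with the low qword at
	 * 0xFFFFFFFFFFFFFFFE, the "+2" block wraps the low qword to 0 and
	 * takes the fixup path.
	 */
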
	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	.if \xctr
		add	$by, counter
	.else
		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
		vptest	ddq_low_msk(%rip), xcounter
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:
	.endif

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i + 1)
	.endr

	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i + 1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i + 1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i + 1)
			.endr
		.endif
	.endif
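
	/*
	 * p_in was already advanced by 16*by in the middle of the rounds
	 * above (so the pointer update overlaps the vaesenc latency); the
	 * plaintext is therefore loaded back at negative offsets below.
	 * Two blocks are XORed per iteration, plus one odd block after the
	 * loop when "by" is odd. Roughly, in C terms:
	 *
	 *	for (i = 0; i < by; i++)
	 *		out[i] = keystream[i] ^ (in - by)[i];	// in already advanced
	 */
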
	.set i, 0
	.rept (by / 2)
		.set j, (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i + 2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i + 1)
	.endr
.endm

.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm

.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm

/* main body of the AES CTR/XCTR routines */
.macro do_aes_ctrmain key_len, xctr
	cmp	$16, num_bytes
	jb	.Ldo_return2\xctr\key_len

	.if \xctr
		shr	$4, counter
		vmovdqu	(p_iv), xiv
	.else
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
	.endif

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len

	/* 1 to 7 remainder blocks, i.e. 1*16 <= tmp <= 7*16 */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
		/* return the updated IV */
		vpshufb	xbyteswap, xcounter, xcounter
		vmovdqu	xcounter, (p_iv)
	.endif
	RET
.endm
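
/*
 * Shape of do_aes_ctrmain as a rough C sketch (illustrative only; do_aes(n)
 * stands for the expanded do_aes macro processing n blocks):
 *
 *	if (num_bytes < 16)
 *		return;
 *	load the IV (and byteswap it for CTR);
 *	n = (num_bytes / 16) % 8;
 *	if (n)
 *		do_aes(n);		// head: also loads the round keys
 *	num_bytes &= ~127;
 *	while (num_bytes) {
 *		do_aes(8);		// keys stay in xkey0/4/8/12
 *		num_bytes -= 128;
 *	}
 *	for CTR, write the updated counter back to *p_iv;
 */
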
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered; saving/restoring must be done at a higher
 * level.
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered; saving/restoring must be done at a higher
 * level.
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered; saving/restoring must be done at a higher
 * level.
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)

/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered; saving/restoring must be done at a higher
 * level.
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *			u8 *out, unsigned int num_bytes,
 *			unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)

/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered; saving/restoring must be done at a higher
 * level.
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *			u8 *out, unsigned int num_bytes,
 *			unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)

/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered; saving/restoring must be done at a higher
 * level.
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *			u8 *out, unsigned int num_bytes,
 *			unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)
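
/*
 * Illustrative call pattern from C glue code (a sketch, not the actual
 * glue implementation; "ctx->key_enc" is a hypothetical name for the
 * expanded encryption key schedule). Because the routines clobber XMM
 * state, the caller must hold FPU context:
 *
 *	kernel_fpu_begin();
 *	aes_ctr_enc_256_avx_by8(src, iv, ctx->key_enc, dst, nbytes);
 *	kernel_fpu_end();
 *
 * Only whole 16-byte blocks are consumed (nbytes is effectively rounded
 * down to a multiple of the block size), so any partial final block must
 * be handled separately by the caller.
 */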