/*
 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * The register assignments below follow the System V AMD64 argument order
 * for (in, iv, keys, out, num_bytes).
 */

#include <linux/linkage.h>
#include <asm/inst.h>

/* Unaligned 128-bit move used for all accesses to the in/out buffers. */
#define VMOVDQ		vmovdqu

/* Up to eight blocks are processed in parallel in xdata0..xdata7. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
/* Running counter, kept byte-reversed so vpaddq can increment it. */
#define xcounter	%xmm8
/* Holds byteswap_const for the whole call. */
#define xbyteswap	%xmm9
/*
 * Round keys that stay resident across main-loop iterations. Which rounds
 * they hold depends on the key length: 0/3/6/9 for AES-128 and 0/4/8/12
 * for AES-192/256.
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
/* Scratch round keys, reloaded from memory on every pass. */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
/* 32-bit view of num_bytes, used only to zero-extend the argument. */
#define num_bytes_32	%r8d

#define tmp		%r10
/* Selector values for the 'club' macro. */
#define	DDQ_DATA	0
#define	XDATA		1
/* Key-length selectors passed to the code-generating macros. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* vpshufb control mask that reverses the 16 bytes of a register. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Selects the low 64 bits; used with vptest to detect a wrapped low qword. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds one to the high 64 bits (carry out of the low qword). */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000001 - 0x00000000000000000000000000000001
/* Increments 1..8 for the low qword of the counter. */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* generate a unique variable for ddq_add_x */

.macro setddq n
	var_ddq_add = ddq_add_\n
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club the numeric 'id' to the symbol 'name'
 *
 * With name == DDQ_DATA this points var_ddq_add at ddq_add_<id>; with
 * name == XDATA it points var_xdata at %xmm<id>. .altmacro is enabled
 * around the call so that '%\id' is evaluated to its numeric value before
 * being pasted into the symbol/register name.
 */

.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in, but not p_out
 *
 * Emits straight-line code that encrypts 'num_in_par' (1..8) consecutive
 * counter blocks, XORs the result with 'num_in_par' input blocks and stores
 * the output at p_out.
 *
 * num_in_par: number of blocks handled in parallel (xdata0..xdata<n-1>)
 * load_keys:  non-zero to (re)load the resident round keys xkey0, xkey4,
 *             xkey8 and xkey12 from p_keys; zero to reuse what they hold
 * key_len:    KEY_128, KEY_192 or KEY_256
 *
 * On entry xcounter holds the byte-reversed counter for the first block and
 * xbyteswap holds byteswap_const. On exit xcounter has been advanced by
 * 'num_in_par'. Round keys are read with vmovdqa, so p_keys must be 16-byte
 * aligned. Clobbers xdata0..xdata<n-1>, xkeyA, xkeyB and the flags.
 *
 * Key loads are interleaved with the vaesenc rounds; the order of the
 * statements below is significant.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the current counter value as is */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1..by-1 use counter + i. vpaddq only adds within each
	 * 64-bit lane, so the carry into the high qword is done by hand:
	 * a low qword of zero after the add means it has just wrapped.
	 * xcounter's high qword is bumped at the same time so the following
	 * blocks see the carry as well.
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* round 0 (whitening) for block 0, then advance the counter by 'by' */
	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0 (whitening) for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* the input is read later through negative offsets from p_in */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: final round for AES-128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12: final round for AES-192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: final round for AES-256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks at a time. The round
	 * keys in xkeyA/xkeyB are no longer needed, so those registers are
	 * reused as input staging. p_in was already advanced by 16*by above.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/* do_aes for 'val' blocks, loading the resident round keys first */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/* do_aes for 'val' blocks, reusing the resident round keys */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/*
 * do_aes_partial nblks key_len
 * Handles the 'nblks' (1..7) blocks that precede the by-8 main loop:
 * processes them with the round keys loaded, advances p_out, strips
 * everything below a multiple of 8 blocks from num_bytes and then either
 * returns or enters the main loop.
 * Only meaningful inside do_aes_ctrmain, whose labels it jumps to.
 */
.macro do_aes_partial nblks, key_len
	do_aes_load	\nblks, \key_len
	add	$(\nblks*16), p_out
	/* ~7*16 == -128: keep only whole groups of 8 blocks */
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.endm

/*
 * main body of aes ctr load
 *
 * Expands to the complete body of one aes_ctr_enc_*_avx_by8 routine,
 * including the final ret.
 *
 * In:   p_in (%rdi), p_iv (%rsi), p_keys (%rdx), p_out (%rcx),
 *       num_bytes (%r8d, unsigned int)
 * Out:  the counter block at p_iv is advanced by the number of blocks
 *       processed
 * Clobbers: p_in, p_out, num_bytes, tmp (%r10), %xmm0-%xmm15, flags
 *
 * Only whole 16-byte blocks are processed; a trailing partial block is
 * ignored. If there is not even one whole block, nothing is read or
 * written, including the IV.
 */
.macro do_aes_ctrmain key_len
	/*
	 * num_bytes is passed as a 32-bit value, so bits 63:32 of the
	 * register are not guaranteed to be zero. Writing the 32-bit
	 * register zero-extends it.
	 */
	mov	num_bytes_32, num_bytes_32

	/*
	 * Less than one block: leave without touching the IV. xcounter and
	 * xbyteswap are not set up yet, so the IV write-back must be skipped.
	 */
	cmp	$16, num_bytes
	jb	.Lno_blocks\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = byte count of the blocks that do not fill a group of 8 */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* tmp = 16 * n with 1 <= n <= 7 */
	cmp	$(4*16), tmp
	ja	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	ja	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_partial	1, \key_len

.Leq2\key_len:
	do_aes_partial	2, \key_len

.Leq3\key_len:
	do_aes_partial	3, \key_len

.Leq4\key_len:
	do_aes_partial	4, \key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	ja	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_partial	5, \key_len

.Leq6\key_len:
	do_aes_partial	6, \key_len

.Leq7\key_len:
	do_aes_partial	7, \key_len

.Lmult_of_8_blks\key_len:
	/*
	 * No leftover blocks were processed, so the resident round keys have
	 * not been loaded yet. The rounds kept resident must match the ones
	 * do_aes expects for this key length.
	 */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a non-zero multiple of 8 blocks (8*16 bytes) */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Lno_blocks\key_len:
	ret
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * keys must be 16-byte aligned (it is read with vmovdqa); in, out and iv
 * may be unaligned. Only whole 16-byte blocks of num_bytes are processed.
 * The counter block at iv is updated to follow the last processed block.
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * keys must be 16-byte aligned (it is read with vmovdqa); in, out and iv
 * may be unaligned. Only whole 16-byte blocks of num_bytes are processed.
 * The counter block at iv is updated to follow the last processed block.
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * keys must be 16-byte aligned (it is read with vmovdqa); in, out and iv
 * may be unaligned. Only whole 16-byte blocks of num_bytes are processed.
 * The counter block at iv is updated to follow the last processed block.
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)