/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62 * 63 */ 64 65#include <linux/linkage.h> 66#include <asm/inst.h> 67 68#define VMOVDQ vmovdqu 69 70#define xdata0 %xmm0 71#define xdata1 %xmm1 72#define xdata2 %xmm2 73#define xdata3 %xmm3 74#define xdata4 %xmm4 75#define xdata5 %xmm5 76#define xdata6 %xmm6 77#define xdata7 %xmm7 78#define xcounter %xmm8 79#define xbyteswap %xmm9 80#define xkey0 %xmm10 81#define xkey4 %xmm11 82#define xkey8 %xmm12 83#define xkey12 %xmm13 84#define xkeyA %xmm14 85#define xkeyB %xmm15 86 87#define p_in %rdi 88#define p_iv %rsi 89#define p_keys %rdx 90#define p_out %rcx 91#define num_bytes %r8 92 93#define tmp %r10 94#define DDQ_DATA 0 95#define XDATA 1 96#define KEY_128 1 97#define KEY_192 2 98#define KEY_256 3 99 100.section .rodata 101.align 16 102 103byteswap_const: 104 .octa 0x000102030405060708090A0B0C0D0E0F 105ddq_low_msk: 106 .octa 0x0000000000000000FFFFFFFFFFFFFFFF 107ddq_high_add_1: 108 .octa 0x00000000000000010000000000000000 109ddq_add_1: 110 .octa 0x00000000000000000000000000000001 111ddq_add_2: 112 .octa 0x00000000000000000000000000000002 113ddq_add_3: 114 .octa 0x00000000000000000000000000000003 115ddq_add_4: 116 .octa 0x00000000000000000000000000000004 117ddq_add_5: 118 .octa 0x00000000000000000000000000000005 119ddq_add_6: 120 .octa 0x00000000000000000000000000000006 121ddq_add_7: 122 .octa 0x00000000000000000000000000000007 123ddq_add_8: 124 .octa 0x00000000000000000000000000000008 125 126.text 127 128/* generate a unique variable for ddq_add_x */ 129 130/* generate a unique variable for xmm register */ 131.macro setxdata n 132 var_xdata = %xmm\n 133.endm 134 135/* club the numeric 'id' to the symbol 'name' */ 136 137.macro club name, id 138.altmacro 139 .if \name == XDATA 140 setxdata %\id 141 .endif 142.noaltmacro 143.endm 144 145/* 146 * do_aes num_in_par load_keys key_len 147 * This increments p_in, but not p_out 148 */ 149.macro do_aes b, k, key_len 150 .set by, \b 151 .set load_keys, \k 152 .set klen, \key_len 153 154 .if (load_keys) 155 vmovdqa 0*16(p_keys), 
xkey0 156 .endif 157 158 vpshufb xbyteswap, xcounter, xdata0 159 160 .set i, 1 161 .rept (by - 1) 162 club XDATA, i 163 vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata 164 vptest ddq_low_msk(%rip), var_xdata 165 jnz 1f 166 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata 167 vpaddq ddq_high_add_1(%rip), xcounter, xcounter 168 1: 169 vpshufb xbyteswap, var_xdata, var_xdata 170 .set i, (i +1) 171 .endr 172 173 vmovdqa 1*16(p_keys), xkeyA 174 175 vpxor xkey0, xdata0, xdata0 176 vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter 177 vptest ddq_low_msk(%rip), xcounter 178 jnz 1f 179 vpaddq ddq_high_add_1(%rip), xcounter, xcounter 180 1: 181 182 .set i, 1 183 .rept (by - 1) 184 club XDATA, i 185 vpxor xkey0, var_xdata, var_xdata 186 .set i, (i +1) 187 .endr 188 189 vmovdqa 2*16(p_keys), xkeyB 190 191 .set i, 0 192 .rept by 193 club XDATA, i 194 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ 195 .set i, (i +1) 196 .endr 197 198 .if (klen == KEY_128) 199 .if (load_keys) 200 vmovdqa 3*16(p_keys), xkey4 201 .endif 202 .else 203 vmovdqa 3*16(p_keys), xkeyA 204 .endif 205 206 .set i, 0 207 .rept by 208 club XDATA, i 209 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ 210 .set i, (i +1) 211 .endr 212 213 add $(16*by), p_in 214 215 .if (klen == KEY_128) 216 vmovdqa 4*16(p_keys), xkeyB 217 .else 218 .if (load_keys) 219 vmovdqa 4*16(p_keys), xkey4 220 .endif 221 .endif 222 223 .set i, 0 224 .rept by 225 club XDATA, i 226 /* key 3 */ 227 .if (klen == KEY_128) 228 vaesenc xkey4, var_xdata, var_xdata 229 .else 230 vaesenc xkeyA, var_xdata, var_xdata 231 .endif 232 .set i, (i +1) 233 .endr 234 235 vmovdqa 5*16(p_keys), xkeyA 236 237 .set i, 0 238 .rept by 239 club XDATA, i 240 /* key 4 */ 241 .if (klen == KEY_128) 242 vaesenc xkeyB, var_xdata, var_xdata 243 .else 244 vaesenc xkey4, var_xdata, var_xdata 245 .endif 246 .set i, (i +1) 247 .endr 248 249 .if (klen == KEY_128) 250 .if (load_keys) 251 vmovdqa 6*16(p_keys), xkey8 252 .endif 253 .else 254 vmovdqa 
6*16(p_keys), xkeyB 255 .endif 256 257 .set i, 0 258 .rept by 259 club XDATA, i 260 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ 261 .set i, (i +1) 262 .endr 263 264 vmovdqa 7*16(p_keys), xkeyA 265 266 .set i, 0 267 .rept by 268 club XDATA, i 269 /* key 6 */ 270 .if (klen == KEY_128) 271 vaesenc xkey8, var_xdata, var_xdata 272 .else 273 vaesenc xkeyB, var_xdata, var_xdata 274 .endif 275 .set i, (i +1) 276 .endr 277 278 .if (klen == KEY_128) 279 vmovdqa 8*16(p_keys), xkeyB 280 .else 281 .if (load_keys) 282 vmovdqa 8*16(p_keys), xkey8 283 .endif 284 .endif 285 286 .set i, 0 287 .rept by 288 club XDATA, i 289 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ 290 .set i, (i +1) 291 .endr 292 293 .if (klen == KEY_128) 294 .if (load_keys) 295 vmovdqa 9*16(p_keys), xkey12 296 .endif 297 .else 298 vmovdqa 9*16(p_keys), xkeyA 299 .endif 300 301 .set i, 0 302 .rept by 303 club XDATA, i 304 /* key 8 */ 305 .if (klen == KEY_128) 306 vaesenc xkeyB, var_xdata, var_xdata 307 .else 308 vaesenc xkey8, var_xdata, var_xdata 309 .endif 310 .set i, (i +1) 311 .endr 312 313 vmovdqa 10*16(p_keys), xkeyB 314 315 .set i, 0 316 .rept by 317 club XDATA, i 318 /* key 9 */ 319 .if (klen == KEY_128) 320 vaesenc xkey12, var_xdata, var_xdata 321 .else 322 vaesenc xkeyA, var_xdata, var_xdata 323 .endif 324 .set i, (i +1) 325 .endr 326 327 .if (klen != KEY_128) 328 vmovdqa 11*16(p_keys), xkeyA 329 .endif 330 331 .set i, 0 332 .rept by 333 club XDATA, i 334 /* key 10 */ 335 .if (klen == KEY_128) 336 vaesenclast xkeyB, var_xdata, var_xdata 337 .else 338 vaesenc xkeyB, var_xdata, var_xdata 339 .endif 340 .set i, (i +1) 341 .endr 342 343 .if (klen != KEY_128) 344 .if (load_keys) 345 vmovdqa 12*16(p_keys), xkey12 346 .endif 347 348 .set i, 0 349 .rept by 350 club XDATA, i 351 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ 352 .set i, (i +1) 353 .endr 354 355 .if (klen == KEY_256) 356 vmovdqa 13*16(p_keys), xkeyA 357 .endif 358 359 .set i, 0 360 .rept by 361 club XDATA, i 362 .if (klen == KEY_256) 363 
/* key 12 */ 364 vaesenc xkey12, var_xdata, var_xdata 365 .else 366 vaesenclast xkey12, var_xdata, var_xdata 367 .endif 368 .set i, (i +1) 369 .endr 370 371 .if (klen == KEY_256) 372 vmovdqa 14*16(p_keys), xkeyB 373 374 .set i, 0 375 .rept by 376 club XDATA, i 377 /* key 13 */ 378 vaesenc xkeyA, var_xdata, var_xdata 379 .set i, (i +1) 380 .endr 381 382 .set i, 0 383 .rept by 384 club XDATA, i 385 /* key 14 */ 386 vaesenclast xkeyB, var_xdata, var_xdata 387 .set i, (i +1) 388 .endr 389 .endif 390 .endif 391 392 .set i, 0 393 .rept (by / 2) 394 .set j, (i+1) 395 VMOVDQ (i*16 - 16*by)(p_in), xkeyA 396 VMOVDQ (j*16 - 16*by)(p_in), xkeyB 397 club XDATA, i 398 vpxor xkeyA, var_xdata, var_xdata 399 club XDATA, j 400 vpxor xkeyB, var_xdata, var_xdata 401 .set i, (i+2) 402 .endr 403 404 .if (i < by) 405 VMOVDQ (i*16 - 16*by)(p_in), xkeyA 406 club XDATA, i 407 vpxor xkeyA, var_xdata, var_xdata 408 .endif 409 410 .set i, 0 411 .rept by 412 club XDATA, i 413 VMOVDQ var_xdata, i*16(p_out) 414 .set i, (i+1) 415 .endr 416.endm 417 418.macro do_aes_load val, key_len 419 do_aes \val, 1, \key_len 420.endm 421 422.macro do_aes_noload val, key_len 423 do_aes \val, 0, \key_len 424.endm 425 426/* main body of aes ctr load */ 427 428.macro do_aes_ctrmain key_len 429 cmp $16, num_bytes 430 jb .Ldo_return2\key_len 431 432 vmovdqa byteswap_const(%rip), xbyteswap 433 vmovdqu (p_iv), xcounter 434 vpshufb xbyteswap, xcounter, xcounter 435 436 mov num_bytes, tmp 437 and $(7*16), tmp 438 jz .Lmult_of_8_blks\key_len 439 440 /* 1 <= tmp <= 7 */ 441 cmp $(4*16), tmp 442 jg .Lgt4\key_len 443 je .Leq4\key_len 444 445.Llt4\key_len: 446 cmp $(2*16), tmp 447 jg .Leq3\key_len 448 je .Leq2\key_len 449 450.Leq1\key_len: 451 do_aes_load 1, \key_len 452 add $(1*16), p_out 453 and $(~7*16), num_bytes 454 jz .Ldo_return2\key_len 455 jmp .Lmain_loop2\key_len 456 457.Leq2\key_len: 458 do_aes_load 2, \key_len 459 add $(2*16), p_out 460 and $(~7*16), num_bytes 461 jz .Ldo_return2\key_len 462 jmp 
.Lmain_loop2\key_len 463 464 465.Leq3\key_len: 466 do_aes_load 3, \key_len 467 add $(3*16), p_out 468 and $(~7*16), num_bytes 469 jz .Ldo_return2\key_len 470 jmp .Lmain_loop2\key_len 471 472.Leq4\key_len: 473 do_aes_load 4, \key_len 474 add $(4*16), p_out 475 and $(~7*16), num_bytes 476 jz .Ldo_return2\key_len 477 jmp .Lmain_loop2\key_len 478 479.Lgt4\key_len: 480 cmp $(6*16), tmp 481 jg .Leq7\key_len 482 je .Leq6\key_len 483 484.Leq5\key_len: 485 do_aes_load 5, \key_len 486 add $(5*16), p_out 487 and $(~7*16), num_bytes 488 jz .Ldo_return2\key_len 489 jmp .Lmain_loop2\key_len 490 491.Leq6\key_len: 492 do_aes_load 6, \key_len 493 add $(6*16), p_out 494 and $(~7*16), num_bytes 495 jz .Ldo_return2\key_len 496 jmp .Lmain_loop2\key_len 497 498.Leq7\key_len: 499 do_aes_load 7, \key_len 500 add $(7*16), p_out 501 and $(~7*16), num_bytes 502 jz .Ldo_return2\key_len 503 jmp .Lmain_loop2\key_len 504 505.Lmult_of_8_blks\key_len: 506 .if (\key_len != KEY_128) 507 vmovdqa 0*16(p_keys), xkey0 508 vmovdqa 4*16(p_keys), xkey4 509 vmovdqa 8*16(p_keys), xkey8 510 vmovdqa 12*16(p_keys), xkey12 511 .else 512 vmovdqa 0*16(p_keys), xkey0 513 vmovdqa 3*16(p_keys), xkey4 514 vmovdqa 6*16(p_keys), xkey8 515 vmovdqa 9*16(p_keys), xkey12 516 .endif 517.align 16 518.Lmain_loop2\key_len: 519 /* num_bytes is a multiple of 8 and >0 */ 520 do_aes_noload 8, \key_len 521 add $(8*16), p_out 522 sub $(8*16), num_bytes 523 jne .Lmain_loop2\key_len 524 525.Ldo_return2\key_len: 526 /* return updated IV */ 527 vpshufb xbyteswap, xcounter, xcounter 528 vmovdqu xcounter, (p_iv) 529 ret 530.endm 531 532/* 533 * routine to do AES128 CTR enc/decrypt "by8" 534 * XMM registers are clobbered. 
535 * Saving/restoring must be done at a higher level 536 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, 537 * unsigned int num_bytes) 538 */ 539ENTRY(aes_ctr_enc_128_avx_by8) 540 /* call the aes main loop */ 541 do_aes_ctrmain KEY_128 542 543ENDPROC(aes_ctr_enc_128_avx_by8) 544 545/* 546 * routine to do AES192 CTR enc/decrypt "by8" 547 * XMM registers are clobbered. 548 * Saving/restoring must be done at a higher level 549 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, 550 * unsigned int num_bytes) 551 */ 552ENTRY(aes_ctr_enc_192_avx_by8) 553 /* call the aes main loop */ 554 do_aes_ctrmain KEY_192 555 556ENDPROC(aes_ctr_enc_192_avx_by8) 557 558/* 559 * routine to do AES256 CTR enc/decrypt "by8" 560 * XMM registers are clobbered. 561 * Saving/restoring must be done at a higher level 562 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, 563 * unsigned int num_bytes) 564 */ 565ENTRY(aes_ctr_enc_256_avx_by8) 566 /* call the aes main loop */ 567 do_aes_ctrmain KEY_256 568 569ENDPROC(aes_ctr_enc_256_avx_by8) 570