/*
 * x86_64/AVX2 assembler optimized version of Twofish
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include "glue_helper-asm-avx2.S"

.file "twofish-avx2-asm_64.S"

.data
.align 16

.Lvpshufb_mask0:
.long 0x80808000
.long 0x80808004
.long 0x80808008
.long 0x8080800c

.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

/* structure of crypto context */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/* register macros */
#define CTX	%rdi

#define RS0	CTX
#define RS1	%r8
#define RS2	%r9
#define RS3	%r10
#define RK	%r11
#define RW	%rax
#define RROUND	%r12
#define RROUNDd	%r12d

#define RA0	%ymm8
#define RB0	%ymm9
#define RC0	%ymm10
#define RD0	%ymm11
#define RA1	%ymm12
#define RB1	%ymm13
#define RC1	%ymm14
#define RD1	%ymm15

/* temp regs */
#define RX0	%ymm0
#define RY0	%ymm1
#define RX1	%ymm2
#define RY1	%ymm3
#define RT0	%ymm4
#define RIDX	%ymm5

#define RX0x	%xmm0
#define RY0x	%xmm1
#define RX1x	%xmm2
#define RY1x	%xmm3
#define RT0x	%xmm4

/* vpgatherdd mask and '-1' */
#define RNOT	%ymm6

/* byte mask, (-1 >> 24) */
#define RBYTE	%ymm7

/**********************************************************************
  16-way AVX2 twofish
 **********************************************************************/
#define init_round_constants() \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpsrld $24, RNOT, RBYTE; \
	leaq k(CTX), RK; \
	leaq w(CTX), RW; \
	leaq s1(CTX), RS1; \
	leaq s2(CTX), RS2; \
	leaq s3(CTX), RS3; \

#define g16(ab, rs0, rs1, rs2, rs3, xy) \
	vpand RBYTE, ab ## 0, RIDX; \
	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	\
	vpand RBYTE, ab ## 1, RIDX; \
	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	\
	vpsrld $8, ab ## 0, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 0, xy ## 0; \
	\
	vpsrld $8, ab ## 1, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 1, xy ## 1; \
	\
	vpsrld $16, ab ## 0, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 0, xy ## 0; \
	\
	vpsrld $16, ab ## 1, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 1, xy ## 1; \
	\
	vpsrld $24, ab ## 0, RIDX; \
	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 0, xy ## 0; \
	\
	vpsrld $24, ab ## 1, RIDX; \
	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 1, xy ## 1;
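/*
 * g16() performs the key-dependent g-function lookups for eight blocks per
 * ymm register pair: each 32-bit word is split into its four bytes, each
 * byte is gathered (vpgatherdd) from one of the four 1 KiB tables, and the
 * four results are xored together.  vpgatherdd clears its mask register on
 * completion, so RNOT is reloaded with all-ones after every gather.
 * g1_16() walks the tables in order s0..s3; g2_16() rotates the order to
 * s1, s2, s3, s0, which is equivalent to g(rol32(x, 8)) in the Twofish
 * specification.
 *
 * A rough scalar equivalent of one lane (a sketch; it assumes the usual
 * kernel context layout, struct twofish_ctx { u32 s[4][256], w[8], k[32]; },
 * with the MDS matrix folded into the s tables, matching the s0..k offsets
 * defined above):
 *
 *	u32 g1(const struct twofish_ctx *ctx, u32 x)
 *	{
 *		return ctx->s[0][x & 0xff] ^ ctx->s[1][(x >> 8) & 0xff] ^
 *		       ctx->s[2][(x >> 16) & 0xff] ^ ctx->s[3][x >> 24];
 *	}
 */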
#define g1_16(a, x) \
	g16(a, RS0, RS1, RS2, RS3, x);

#define g2_16(b, y) \
	g16(b, RS1, RS2, RS3, RS0, y);

#define encrypt_round_end16(a, b, c, d, nk) \
	vpaddd RY0, RX0, RX0; \
	vpaddd RX0, RY0, RY0; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX0, RX0; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY0, RY0; \
	\
	vpxor RY0, d ## 0, d ## 0; \
	\
	vpxor RX0, c ## 0, c ## 0; \
	vpsrld $1, c ## 0, RT0; \
	vpslld $31, c ## 0, c ## 0; \
	vpor RT0, c ## 0, c ## 0; \
	\
	vpaddd RY1, RX1, RX1; \
	vpaddd RX1, RY1, RY1; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX1, RX1; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY1, RY1; \
	\
	vpxor RY1, d ## 1, d ## 1; \
	\
	vpxor RX1, c ## 1, c ## 1; \
	vpsrld $1, c ## 1, RT0; \
	vpslld $31, c ## 1, c ## 1; \
	vpor RT0, c ## 1, c ## 1; \

#define encrypt_round16(a, b, c, d, nk) \
	g2_16(b, RY); \
	\
	vpslld $1, b ## 0, RT0; \
	vpsrld $31, b ## 0, b ## 0; \
	vpor RT0, b ## 0, b ## 0; \
	\
	vpslld $1, b ## 1, RT0; \
	vpsrld $31, b ## 1, b ## 1; \
	vpor RT0, b ## 1, b ## 1; \
	\
	g1_16(a, RX); \
	\
	encrypt_round_end16(a, b, c, d, nk);

#define encrypt_round_first16(a, b, c, d, nk) \
	vpslld $1, d ## 0, RT0; \
	vpsrld $31, d ## 0, d ## 0; \
	vpor RT0, d ## 0, d ## 0; \
	\
	vpslld $1, d ## 1, RT0; \
	vpsrld $31, d ## 1, d ## 1; \
	vpor RT0, d ## 1, d ## 1; \
	\
	encrypt_round16(a, b, c, d, nk);

#define encrypt_round_last16(a, b, c, d, nk) \
	g2_16(b, RY); \
	\
	g1_16(a, RX); \
	\
	encrypt_round_end16(a, b, c, d, nk);

#define decrypt_round_end16(a, b, c, d, nk) \
	vpaddd RY0, RX0, RX0; \
	vpaddd RX0, RY0, RY0; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX0, RX0; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY0, RY0; \
	\
	vpxor RX0, c ## 0, c ## 0; \
	\
	vpxor RY0, d ## 0, d ## 0; \
	vpsrld $1, d ## 0, RT0; \
	vpslld $31, d ## 0, d ## 0; \
	vpor RT0, d ## 0, d ## 0; \
	\
	vpaddd RY1, RX1, RX1; \
	vpaddd RX1, RY1, RY1; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX1, RX1; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY1, RY1; \
	\
	vpxor RX1, c ## 1, c ## 1; \
	\
	vpxor RY1, d ## 1, d ## 1; \
	vpsrld $1, d ## 1, RT0; \
	vpslld $31, d ## 1, d ## 1; \
	vpor RT0, d ## 1, d ## 1;

#define decrypt_round16(a, b, c, d, nk) \
	g1_16(a, RX); \
	\
	vpslld $1, a ## 0, RT0; \
	vpsrld $31, a ## 0, a ## 0; \
	vpor RT0, a ## 0, a ## 0; \
	\
	vpslld $1, a ## 1, RT0; \
	vpsrld $31, a ## 1, a ## 1; \
	vpor RT0, a ## 1, a ## 1; \
	\
	g2_16(b, RY); \
	\
	decrypt_round_end16(a, b, c, d, nk);

#define decrypt_round_first16(a, b, c, d, nk) \
	vpslld $1, c ## 0, RT0; \
	vpsrld $31, c ## 0, c ## 0; \
	vpor RT0, c ## 0, c ## 0; \
	\
	vpslld $1, c ## 1, RT0; \
	vpsrld $31, c ## 1, c ## 1; \
	vpor RT0, c ## 1, c ## 1; \
	\
	decrypt_round16(a, b, c, d, nk)

#define decrypt_round_last16(a, b, c, d, nk) \
	g1_16(a, RX); \
	\
	g2_16(b, RY); \
	\
	decrypt_round_end16(a, b, c, d, nk);
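/*
 * Per 32-bit lane, the encrypt round macros above compute the usual Twofish
 * round, roughly the scalar sketch below (g1/g2 as sketched earlier, k[]
 * being the round subkeys at offset 'k' in the context, n the round number):
 *
 *	x  = g1(ctx, a);
 *	y  = g2(ctx, b);
 *	x += y;
 *	y += x;
 *	c  = ror32(c ^ (x + ctx->k[2 * n]), 1);
 *	d  = rol32(d, 1) ^ (y + ctx->k[2 * n + 1]);
 *
 * The rol32(d, 1) is hoisted: it is applied to the 'b' operand of the round
 * that consumes that register, right after its g2 lookup.  That is why
 * encrypt_round_first16() rotates d explicitly (no earlier round has done it
 * yet) and encrypt_round_last16() skips rotating b (no later round will use
 * it).  The decrypt macros mirror this; there the deferred rotation is
 * rol32(c, 1), applied to the 'a' operand right after its g1 lookup.
 */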
#define encrypt_cycle16() \
	encrypt_round16(RA, RB, RC, RD, 0); \
	encrypt_round16(RC, RD, RA, RB, 8);

#define encrypt_cycle_first16() \
	encrypt_round_first16(RA, RB, RC, RD, 0); \
	encrypt_round16(RC, RD, RA, RB, 8);

#define encrypt_cycle_last16() \
	encrypt_round16(RA, RB, RC, RD, 0); \
	encrypt_round_last16(RC, RD, RA, RB, 8);

#define decrypt_cycle16(n) \
	decrypt_round16(RC, RD, RA, RB, 8); \
	decrypt_round16(RA, RB, RC, RD, 0);

#define decrypt_cycle_first16(n) \
	decrypt_round_first16(RC, RD, RA, RB, 8); \
	decrypt_round16(RA, RB, RC, RD, 0);

#define decrypt_cycle_last16(n) \
	decrypt_round16(RC, RD, RA, RB, 8); \
	decrypt_round_last16(RA, RB, RC, RD, 0);

#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define read_blocks8(offs,a,b,c,d) \
	transpose_4x4(a, b, c, d, RX0, RY0);

#define write_blocks8(offs,a,b,c,d) \
	transpose_4x4(a, b, c, d, RX0, RY0);

#define inpack_enc8(a,b,c,d) \
	vpbroadcastd 4*0(RW), RT0; \
	vpxor RT0, a, a; \
	\
	vpbroadcastd 4*1(RW), RT0; \
	vpxor RT0, b, b; \
	\
	vpbroadcastd 4*2(RW), RT0; \
	vpxor RT0, c, c; \
	\
	vpbroadcastd 4*3(RW), RT0; \
	vpxor RT0, d, d;

#define outunpack_enc8(a,b,c,d) \
	vpbroadcastd 4*4(RW), RX0; \
	vpbroadcastd 4*5(RW), RY0; \
	vpxor RX0, c, RX0; \
	vpxor RY0, d, RY0; \
	\
	vpbroadcastd 4*6(RW), RT0; \
	vpxor RT0, a, c; \
	vpbroadcastd 4*7(RW), RT0; \
	vpxor RT0, b, d; \
	\
	vmovdqa RX0, a; \
	vmovdqa RY0, b;

#define inpack_dec8(a,b,c,d) \
	vpbroadcastd 4*4(RW), RX0; \
	vpbroadcastd 4*5(RW), RY0; \
	vpxor RX0, a, RX0; \
	vpxor RY0, b, RY0; \
	\
	vpbroadcastd 4*6(RW), RT0; \
	vpxor RT0, c, a; \
	vpbroadcastd 4*7(RW), RT0; \
	vpxor RT0, d, b; \
	\
	vmovdqa RX0, c; \
	vmovdqa RY0, d;

#define outunpack_dec8(a,b,c,d) \
	vpbroadcastd 4*0(RW), RT0; \
	vpxor RT0, a, a; \
	\
	vpbroadcastd 4*1(RW), RT0; \
	vpxor RT0, b, b; \
	\
	vpbroadcastd 4*2(RW), RT0; \
	vpxor RT0, c, c; \
	\
	vpbroadcastd 4*3(RW), RT0; \
	vpxor RT0, d, d;

#define read_blocks16(a,b,c,d) \
	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);

#define write_blocks16(a,b,c,d) \
	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);

#define xor_blocks16(a,b,c,d) \
	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);

#define inpack_enc16(a,b,c,d) \
	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);

#define outunpack_enc16(a,b,c,d) \
	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);

#define inpack_dec16(a,b,c,d) \
	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);

#define outunpack_dec16(a,b,c,d) \
	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
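/*
 * Data layout: the 16 blocks are handled as two groups of eight.  After
 * read_blocks16() each ymm register holds the same 32-bit word from eight
 * different blocks (transpose_4x4() converts between block order and this
 * word-sliced order; write_blocks16() transposes back).  The inpack and
 * outunpack macros apply the pre- and post-whitening with w[0..7], including
 * the word swap on output.  Per block, the encrypt-side whitening amounts to
 * roughly this sketch:
 *
 *	a ^= ctx->w[0]; b ^= ctx->w[1];		// input whitening
 *	c ^= ctx->w[2]; d ^= ctx->w[3];
 *	// ... 16 rounds ...
 *	out[0] = c ^ ctx->w[4];			// output whitening + swap
 *	out[1] = d ^ ctx->w[5];
 *	out[2] = a ^ ctx->w[6];
 *	out[3] = b ^ ctx->w[7];
 */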
.align 8
__twofish_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
	 * output:
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
	 */
	init_round_constants();

	read_blocks16(RA, RB, RC, RD);
	inpack_enc16(RA, RB, RC, RD);

	xorl RROUNDd, RROUNDd;
	encrypt_cycle_first16();
	movl $2, RROUNDd;

.align 4
.L__enc_loop:
	encrypt_cycle16();

	addl $2, RROUNDd;
	cmpl $14, RROUNDd;
	jne .L__enc_loop;

	encrypt_cycle_last16();

	outunpack_enc16(RA, RB, RC, RD);
	write_blocks16(RA, RB, RC, RD);

	ret;
ENDPROC(__twofish_enc_blk16)

.align 8
__twofish_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
	 * output:
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
	 */
	init_round_constants();

	read_blocks16(RA, RB, RC, RD);
	inpack_dec16(RA, RB, RC, RD);

	movl $14, RROUNDd;
	decrypt_cycle_first16();
	movl $12, RROUNDd;

.align 4
.L__dec_loop:
	decrypt_cycle16();

	addl $-2, RROUNDd;
	jnz .L__dec_loop;

	decrypt_cycle_last16();

	outunpack_dec16(RA, RB, RC, RD);
	write_blocks16(RA, RB, RC, RD);

	ret;
ENDPROC(__twofish_dec_blk16)

ENTRY(twofish_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;
	pushq %r12;

	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	call __twofish_enc_blk16;

	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_ecb_enc_16way)

ENTRY(twofish_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;
	pushq %r12;

	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	call __twofish_dec_blk16;

	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_ecb_dec_16way)

ENTRY(twofish_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;
	pushq %r12;

	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	call __twofish_dec_blk16;

	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
			RX0);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_cbc_dec_16way)

ENTRY(twofish_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */

	vzeroupper;
	pushq %r12;

	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
		       RBYTE);

	call __twofish_enc_blk16;

	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_ctr_16way)
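/*
 * XTS: twofish_xts_crypt_16way below is shared by the encryption and
 * decryption entry points; %r8 carries the address of the 16-way core to
 * call.  load_xts_16way() (provided by the included glue_helper-asm-avx2.S)
 * applies the per-block tweaks, stepping the tweak from block to block by
 * multiplication with α in GF(2¹²⁸); the .Lxts_gf128mul_and_shl1_mask_*
 * constants above encode that shift-and-reduce step, 0x87 being the usual
 * reduction term x⁷+x²+x+1.  Per 128-bit tweak the step is roughly the
 * sketch below (lo/hi being the little-endian 64-bit halves):
 *
 *	carry = hi >> 63;
 *	hi    = (hi << 1) | (lo >> 63);
 *	lo    = (lo << 1) ^ (carry ? 0x87 : 0);
 */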
.align 8
twofish_xts_crypt_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
	 */

	vzeroupper;
	pushq %r12;

	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call *%r8;

	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_xts_crypt_16way)

ENTRY(twofish_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	leaq __twofish_enc_blk16, %r8;
	jmp twofish_xts_crypt_16way;
ENDPROC(twofish_xts_enc_16way)

ENTRY(twofish_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	leaq __twofish_dec_blk16, %r8;
	jmp twofish_xts_crypt_16way;
ENDPROC(twofish_xts_dec_16way)
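/*
 * For reference, the register comments above imply C-side prototypes along
 * the lines of the sketch below; the authoritative declarations live in the
 * C glue code, and the iv type in particular is an assumption based on the
 * "little endian, 128bit" / tweak comments:
 *
 *	asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *	asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *	asmlinkage void twofish_cbc_dec_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *	asmlinkage void twofish_ctr_16way(struct twofish_ctx *ctx,
 *					  u8 *dst, const u8 *src, le128 *iv);
 *	asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src, le128 *iv);
 *	asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src, le128 *iv);
 */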