/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

.data
.align 16

.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

#define RNOT %ymm0
#define tp   %ymm1

#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

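/*
 * The Sn_1/Sn_2 and SIn_1/SIn_2 macros below are the eight Serpent
 * S-boxes and their inverses, written as branchless vpxor/vpand/vpor
 * sequences over 32-bit slices.  Each S-box is split into two halves
 * so that the round macros can slot other work (such as the subkey
 * broadcasts in SP below) between them.  RNOT is kept all-ones and
 * provides bitwise NOT via vpxor; tp is a scratch register.
 */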
#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

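/*
 * get_key() broadcasts word j of subkey i from the expanded key at
 * CTX into every dword lane of a ymm register using vpbroadcastd.
 * K2() XORs all four words of subkey i into both eight-block register
 * sets; it performs the initial key mixing and adds the final subkey
 * (index 32) after the last S-box.
 */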
#define get_key(i,j,t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

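/*
 * LK2() applies Serpent's linear transformation (the fixed rotate and
 * shift-XOR mixing layer) to both register sets and, interleaved with
 * it, fetches and XORs in subkey i for the following round.  KL2() is
 * the decryption counterpart: it first strips subkey i and then undoes
 * the linear transformation, again for both register sets.
 */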
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

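/*
 * transpose_4x4() transposes a 4x4 matrix of 32-bit words within each
 * 128-bit lane of the four registers.  read_blocks() uses it to bring
 * the loaded blocks into sliced form, where each register holds the
 * same 32-bit word of eight different blocks (four per lane), which is
 * what the S-box and linear-transformation macros operate on;
 * write_blocks() converts back.  The two register sets together cover
 * the 16 blocks handled per call.
 */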
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
__serpent_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk16)

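/*
 * Decryption walks the rounds backwards: subkey 32 is removed first,
 * then each round applies an inverse S-box (SI7 down to SI0, cycling)
 * via SP(), after which KL2() strips that round's subkey and undoes
 * the linear transformation, finishing with subkey 0.  The result ends
 * up in the RC/RD/RB/RE registers rather than RA..RD, which is why the
 * callers below store from that register order.
 */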
.align 8
__serpent_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk16)

ENTRY(serpent_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	ret;
ENDPROC(serpent_ecb_enc_16way)

ENTRY(serpent_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	ret;
ENDPROC(serpent_ecb_dec_16way)

ENTRY(serpent_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	ret;
ENDPROC(serpent_cbc_dec_16way)

ENTRY(serpent_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */

	vzeroupper;

	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       tp);

	call __serpent_enc_blk16;

	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	ret;
ENDPROC(serpent_ctr_16way)

ENTRY(serpent_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_enc_blk16;

	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	ret;
ENDPROC(serpent_xts_enc_16way)

ENTRY(serpent_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_dec_blk16;

	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	ret;
ENDPROC(serpent_xts_dec_16way)
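
/*
 * The ENTRY points above take ctx in %rdi, dst in %rsi, src in %rdx
 * and, for the CTR/XTS variants, the iv in %rcx, as noted in their
 * headers.  Under the SysV ABI the C glue code would therefore declare
 * them roughly as follows (parameter types are an assumed sketch; the
 * actual declarations live in the glue code):
 *
 *	asmlinkage void serpent_ecb_enc_16way(void *ctx, u8 *dst, const u8 *src);
 *	asmlinkage void serpent_ecb_dec_16way(void *ctx, u8 *dst, const u8 *src);
 *	asmlinkage void serpent_cbc_dec_16way(void *ctx, u8 *dst, const u8 *src);
 *	asmlinkage void serpent_ctr_16way(void *ctx, u8 *dst, const u8 *src, le128 *iv);
 *	asmlinkage void serpent_xts_enc_16way(void *ctx, u8 *dst, const u8 *src, le128 *iv);
 *	asmlinkage void serpent_xts_dec_16way(void *ctx, u8 *dst, const u8 *src, le128 *iv);
 */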