/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

/*
 * Syntax: GNU as, AT&T operand order, preprocessed with cpp (.S file).
 * Arguments arrive in %rdi, %rsi, %rdx, %rcx; %rbx and %rbp are preserved
 * for the caller by every entry point in this file.
 *
 * NOTE: the lookup tables below are addressed absolutely (no %rip base),
 * so this object is not position-independent as written.
 */

.file "camellia-x86_64-asm_64.S"
.text

/* Lookup tables defined outside this file; each is indexed by one byte
 * and read as 8-byte entries (see xor2ror16). */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
/* Byte offsets into the context; the structure itself is declared on the
 * C side, so these must be kept in sync with it. */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/* Block state: suffix 0 is the first block, suffix 1 the second block
 * (2-way code only). */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/* Scratch registers.  RT0 is the same register as RIO (%rsi), so the I/O
 * pointer is destroyed by the round macros and has to be reloaded from
 * RDST before the output is written.  RT1 is %rbp, which the entry points
 * park in RRBP and restore before returning. */
#define RT0 %rsi
#define RT1 %rbp
#define RT2 %r8

#define RT0d %esi
#define RT1d %ebp
#define RT2d %r8d

#define RT2bl %r8b

#define RXOR %r9
#define RRBP %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16: XOR two table entries, selected by the two lowest bytes of
 * `ab`, into `dst`, and rotate `ab` right by 16 bits.
 *
 *   tmp2 = ab bits 0..7   -> index into T0
 *   tmp1 = ab bits 8..15  -> index into T1
 *
 * Clobbers tmp1, tmp2 and flags.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq T0(, tmp2, 8),		dst; \
	xorq T1(, tmp1, 8),		dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm: one round on block 0.
 *
 * Loads the 8-byte subkey number `subkey` into RT2, feeds all eight bytes
 * of ab0 through the lookup tables (four xor2ror16 steps, i.e. a full
 * 64-bit rotation, so ab0 ends up unchanged), accumulating alternately
 * into cd0 and RT2, then folds RT2 into cd0.
 *
 * Clobbers RT0, RT1, RT2 and flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;

/*
 * fls: key-dependent mixing layer applied to both halves of block 0
 * between groups of rounds.  With lo/hi denoting the low/high 32 bits of
 * a 64-bit value, the four steps are, in order:
 *
 *   l.hi ^= rol32(l.lo & kl.lo, 1)
 *   r.lo ^= (r.hi | kr.hi)
 *   l.lo ^= (l.hi | kl.hi)
 *   r.hi ^= rol32(r.lo & kr.lo, 1)
 *
 * kl / kr are subkey numbers.  Clobbers RT0, RT1, RT2 and flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;

/* Six rounds using subkeys i+2 .. i+7 in ascending order. */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/* Load the 16-byte block at RIO into RAB0/RCD0 (byte-swapped, 32-bit
 * halves exchanged) and XOR subkey 0 into RAB0. */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/* XOR subkey number `max` into RCD0, undo the input transformation and
 * write RCD0 to bytes 0..7 and RAB0 to bytes 8..15 of RIO.  `op` is
 * `mov` (store) or `xor` (XOR into the existing memory contents). */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);

/* Six rounds using subkeys i+7 .. i+2 in descending order. */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/* As enc_inpack, but the subkey XORed into RAB0 is number `max`. */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/* As enc_outunpack(mov, ...), but the subkey XORed into RCD0 is number 0. */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);

/*
 * __camellia_enc_blk: encrypt one 16-byte block.
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src, %rcx = bool xor
 *        (only the low byte of %rcx is examined; non-zero means the
 *        result is XORed into dst instead of stored)
 * Out:   16 bytes written at dst
 * Clobb: %rax, %rcx, %rsi, %r8-%r11, flags
 *        (%rbp is used as scratch and restored from %r10)
 */
.global __camellia_enc_blk;
.type   __camellia_enc_blk,@function;

__camellia_enc_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %rbp, RRBP;

	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/* key_length == 16: finished after three round groups.  Compared as
	 * a 32-bit field, same as the decrypt paths. */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* RIO was clobbered as RT0; mov keeps flags */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;
.size __camellia_enc_blk, .-__camellia_enc_blk

/*
 * camellia_dec_blk: decrypt one 16-byte block.
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src
 * Out:   16 bytes stored at dst
 * Clobb: %rax, %rcx, %rsi, %r8-%r11, flags
 *        (%rbp is used as scratch and restored from %r10)
 */
.global camellia_dec_blk;
.type   camellia_dec_blk,@function;

camellia_dec_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; the movl's preserve flags */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbp, RRBP;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;		/* RIO was clobbered as RT0 */

	dec_outunpack();

	movq RRBP, %rbp;
	ret;
.size camellia_dec_blk, .-camellia_dec_blk

/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2: one round on two blocks.
 *
 * Block 0 is processed as in roundsm (accumulating into cd0 and RT2).
 * For block 1 the subkey is XORed into cd1 up front and all table
 * lookups accumulate directly into cd1.  The final fold of RT2 into cd0
 * is interleaved with the block-1 lookups.
 *
 * Clobbers RT0, RT1, RT2 and flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2: the fls steps applied to both blocks, with the two blocks'
 * instruction sequences interleaved.  Clobbers RT0, RT1, RT2 and flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;

#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/* Load two consecutive 16-byte blocks from RIO (offsets 0/8 and 16/24)
 * and XOR subkey 0 into RAB0 and RAB1. */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
	movq 8*2(RIO),			RAB1; \
	bswapq				RAB1; \
	rorq $32,			RAB1; \
	movq 12*2(RIO),			RCD1; \
	bswapq				RCD1; \
	rolq $32,			RCD1; \
	xorq key_table(CTX),		RAB1;

/* Two-block counterpart of enc_outunpack: writes 32 bytes at RIO. */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
	xorq key_table(CTX, max, 8),	RCD1; \
	rolq $32,			RCD1; \
	bswapq				RCD1; \
	op ## q RCD1,			8*2(RIO); \
	rorq $32,			RAB1; \
	bswapq				RAB1; \
	op ## q RAB1,			12*2(RIO);

#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/* As enc_inpack2, but the subkey XORed into RAB0/RAB1 is number `max`. */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
	movq 8*2(RIO),			RAB1; \
	bswapq				RAB1; \
	rorq $32,			RAB1; \
	movq 12*2(RIO),			RCD1; \
	bswapq				RCD1; \
	rolq $32,			RCD1; \
	xorq key_table(CTX, max, 8),	RAB1;

/* Two-block counterpart of dec_outunpack: stores 32 bytes at RIO. */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
	xorq key_table(CTX),		RCD1; \
	rolq $32,			RCD1; \
	bswapq				RCD1; \
	movq RCD1,			8*2(RIO); \
	rorq $32,			RAB1; \
	bswapq				RAB1; \
	movq RAB1,			12*2(RIO);

/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src, %rcx = bool xor
 *        (only the low byte of %rcx is examined; non-zero means the
 *        result is XORed into dst instead of stored)
 * Out:   32 bytes written at dst
 * Clobb: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags
 *        (%rbx is saved on the stack, %rbp in %r10; both are restored)
 */
.global __camellia_enc_blk_2way;
.type   __camellia_enc_blk_2way,@function;

__camellia_enc_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	movq %rbp, RRBP;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/* key_length == 16: finished after three round groups.  Compared as
	 * a 32-bit field, same as the decrypt paths. */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* RIO was clobbered as RT0; mov keeps flags */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;
.size __camellia_enc_blk_2way, .-__camellia_enc_blk_2way

/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks.
 *
 * In:    %rdi = ctx, %rsi = dst, %rdx = src
 * Out:   32 bytes stored at dst
 * Clobb: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags
 *        (%rbx is parked in %r9 and %rbp in %r10; both are restored)
 */
.global camellia_dec_blk_2way;
.type   camellia_dec_blk_2way,@function;

camellia_dec_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; the movl's preserve flags */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* No xor flag here, so RXOR is free to hold the caller's %rbx. */
	movq %rbx, RXOR;
	movq %rbp, RRBP;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;		/* RIO was clobbered as RT0 */

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;
.size camellia_dec_blk_2way, .-camellia_dec_blk_2way