/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

/*
 * Syntax: AT&T operand order (source, destination), run through the C
 * preprocessor before assembly. Register roles are given symbolic names
 * below; several names deliberately alias the same physical register.
 */

#include <linux/linkage.h>

.file "twofish-x86_64-asm-3way.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets from CTX. s0..s3 are used as tables of 4-byte entries
 * indexed by a single byte (see do16bit_ror), w is XORed into the data by
 * inpack3/outunpack3, and k is added in the round-end macros.
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  3-way twofish
 **********************************************************************/
#define CTX %rdi	/* context pointer, kept for the whole function */
#define RIO %rdx	/* current input/output buffer pointer */

/*
 * One 64-bit register per block. do16bit_ror reads both the low-byte
 * (bl) and second-byte (bh) views of these registers.
 */
#define RAB0 %rax
#define RAB1 %rbx
#define RAB2 %rcx

#define RAB0d %eax
#define RAB1d %ebx
#define RAB2d %ecx

#define RAB0bh %ah
#define RAB1bh %bh
#define RAB2bh %ch

#define RAB0bl %al
#define RAB1bl %bl
#define RAB2bl %cl

/* stack slots; they hold the values pushed by push_cd() until pop_cd() */
#define CD0 0x0(%rsp)
#define CD1 0x8(%rsp)
#define CD2 0x10(%rsp)

/* used only before/after all rounds */
#define RCD0 %r8
#define RCD1 %r9
#define RCD2 %r10

/* used only during rounds (same physical registers as RCD0..RCD2) */
#define RX0 %r8
#define RX1 %r9
#define RX2 %r10

#define RX0d %r8d
#define RX1d %r9d
#define RX2d %r10d

#define RY0 %r11
#define RY1 %r12
#define RY2 %r13

#define RY0d %r11d
#define RY1d %r12d
#define RY2d %r13d

/*
 * Scratch registers. RT0 is the same physical register as RIO (%rdx) and
 * RT1 is %rsi (the dst argument), so neither pointer is valid while the
 * round macros run; the entry points save dst on the stack for that reason.
 */
#define RT0 %rdx
#define RT1 %rsi

#define RT0d %edx
#define RT1d %esi

#define RT1bl %sil

/*
 * do16bit_ror: two byte-indexed table lookups driven by the low 16 bits
 * of 'ab'.
 *
 *   tmp2 = bits 0..7 of ab, tmp1 = bits 8..15 of ab (zero-extended)
 *   ab   = ab rotated right by 'rot' bits
 *   dst  = op1 applied with T0[tmp2], then op2 applied with T1[tmp1]
 *
 * op1/op2 are instruction stems: 'mov' starts a new value, 'xor'
 * accumulates into dst. Table entries are 32 bits wide. tmp2 may be the
 * same register as dst, because it is read as an index by the instruction
 * that overwrites dst.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $(rot),			ab; \
	op1##l T0(CTX, tmp2, 4),	dst ## d; \
	op2##l T1(CTX, tmp1, 4),	dst ## d;

/*
 * swap_ab_with_cd: exchange register 'ab' with 'cd' (a CDn stack slot at
 * every call site in this file), using 'tmp' as scratch.
 */
#define swap_ab_with_cd(ab, cd, tmp) \
	movq cd,			tmp; \
	movq ab,			cd; \
	movq tmp,			ab;

/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * For each block i in 0..2, with a = low 32 bits and b = high 32 bits of
 * ab##i, and byte 0 being the least significant byte:
 *
 *   x##i = Tx0[a.byte0] ^ Tx1[a.byte1] ^ Tx2[a.byte2] ^ Tx3[a.byte3]
 *   y##i = Ty1[b.byte0] ^ Ty2[b.byte1] ^ Ty3[b.byte2] ^ Ty0[b.byte3]
 *
 * The four rotates applied to each ab##i add up to 128 bits, so the
 * register holds its original value again when it is finally exchanged
 * with cd##i. Clobbers RT0 and RT1.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	swap_ab_with_cd(ab ## 2, cd ## 2, RT0);

/*
 * enc_round_end: fold x/y and round keys k[2n], k[2n+1] into 'ab'.
 *
 * 'ab' is the register that g1g2_3 has just loaded from the cd slot.
 * With lo/hi being its low/high 32 bits:
 *
 *   x  = x + y + k[2n]
 *   y  = x + 2*y + k[2n+1]            (x, y as on entry)
 *   lo' = ror32(lo ^ x, 1)
 *   hi' = rol32(hi, 1) ^ y
 *
 * The final orq relies on the 32-bit operations having cleared the upper
 * half of x.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	xorl ab ## d,			x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	shrq $32,			ab; \
	roll $1,			ab ## d; \
	xorl y ## d,			ab ## d; \
	shlq $32,			ab; \
	rorl $1,			x ## d; \
	orq x,				ab;

/*
 * dec_round_end: counterpart of enc_round_end for decryption.
 *
 * With lo/hi being the low/high 32 bits of 'ba':
 *
 *   x  = x + y + k[2n]
 *   y  = x + 2*y + k[2n+1]            (x, y as on entry)
 *   lo' = ror32(lo ^ y, 1)
 *   hi' = rol32(hi, 1) ^ x
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	xorl ba ## d,			y ## d; \
	shrq $32,			ba; \
	roll $1,			ba ## d; \
	xorl x ## d,			ba ## d; \
	shlq $32,			ba; \
	rorl $1,			y ## d; \
	orq y,				ba;

/* One encryption round (round index n) on all three blocks. */
#define encrypt_round3(ab, cd, n) \
	g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(ab ## 0, RX0, RY0, n); \
	enc_round_end(ab ## 1, RX1, RY1, n); \
	enc_round_end(ab ## 2, RX2, RY2, n);

/*
 * One decryption round (round index n) on all three blocks. The table
 * order is shifted and the x/y outputs of g1g2_3 are exchanged (RY, RX)
 * relative to encrypt_round3.
 */
#define decrypt_round3(ba, dc, n) \
	g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(ba ## 0, RX0, RY0, n); \
	dec_round_end(ba ## 1, RX1, RY1, n); \
	dec_round_end(ba ## 2, RX2, RY2, n);

/* Cycle n = rounds 2n and 2n+1, in that order. */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);

/* Cycle n = rounds 2n+1 and 2n, in that order. */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));

/*
 * Spill RCD0..RCD2 to the stack so that CD0..CD2 address them and the
 * registers can be reused as RX0..RX2.
 */
#define push_cd() \
	pushq RCD2; \
	pushq RCD1; \
	pushq RCD0;

/* Reload RCD0..RCD2 from the CD0..CD2 slots and release them. */
#define pop_cd() \
	popq RCD0; \
	popq RCD1; \
	popq RCD2;

/*
 * inpack3: for each of three blocks spaced 16 bytes apart at 'in', load
 * the 64 bits starting at 32-bit word index n into xy##0..xy##2 and XOR
 * them with the 64 bits at w + 4*m in the context.
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;

/*
 * outunpack3: XOR xy##0..xy##2 with the 64 bits at w + 4*m in the context
 * and write them to word index n of three blocks spaced 16 bytes apart at
 * 'out'. 'op' is the instruction stem for the write: 'mov' stores, 'xor'
 * XORs into the existing memory contents.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);

/* Encryption input: words 0-1 -> RAB (w[0..1]), words 2-3 -> RCD (w[2..3]). */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/* Encryption output: RAB -> words 2-3 (w[6..7]), RCD -> words 0-1 (w[4..5]). */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);

/*
 * Decryption input: words 0-1 -> RAB (w[4..5]), words 2-3 -> RCD (w[6..7]);
 * the 32-bit halves of every register are then exchanged.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;

/*
 * Decryption output: the 32-bit halves of every register are exchanged
 * back, then RCD -> words 0-1 (w[0..1]) and RAB -> words 2-3 (w[2..3]).
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);

/*
 * __twofish_enc_blk_3way: run 16 encryption rounds on three 16-byte
 * blocks read from src and write (or XOR) the result into dst.
 *
 * Saves and restores %rbx, %r12 and %r13. %rcx and %rsi are also pushed
 * because both registers are reused inside the function (as RAB2 and RT1).
 * No value is loaded into %rax for the caller.
 */
ENTRY(__twofish_enc_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	inpack_enc3();

	/* CD0..CD2 are valid from here until pop_cd() */
	push_cd();
	encrypt_cycle3(RAB, CD, 0);
	encrypt_cycle3(RAB, CD, 1);
	encrypt_cycle3(RAB, CD, 2);
	encrypt_cycle3(RAB, CD, 3);
	encrypt_cycle3(RAB, CD, 4);
	encrypt_cycle3(RAB, CD, 5);
	encrypt_cycle3(RAB, CD, 6);
	encrypt_cycle3(RAB, CD, 7);
	pop_cd();

	popq RIO; /* dst */
	popq RT1; /* bool xor */

	/* only the low byte of the bool argument is examined */
	testb RT1bl, RT1bl;
	jnz .L__enc_xor3;

	outunpack_enc3(mov);

	popq %rbx;
	popq %r12;
	popq %r13;
	ret;

.L__enc_xor3:
	outunpack_enc3(xor);

	popq %rbx;
	popq %r12;
	popq %r13;
	ret;
ENDPROC(__twofish_enc_blk_3way)

/*
 * twofish_dec_blk_3way: run 16 decryption rounds on three 16-byte blocks
 * read from src and store the result to dst.
 *
 * Saves and restores %rbx, %r12 and %r13. %rsi is pushed because it is
 * reused as RT1 during the rounds. No value is loaded into %rax for the
 * caller.
 */
ENTRY(twofish_dec_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	pushq %rsi; /* dst */

	inpack_dec3();

	/* CD0..CD2 are valid from here until pop_cd() */
	push_cd();
	decrypt_cycle3(RAB, CD, 7);
	decrypt_cycle3(RAB, CD, 6);
	decrypt_cycle3(RAB, CD, 5);
	decrypt_cycle3(RAB, CD, 4);
	decrypt_cycle3(RAB, CD, 3);
	decrypt_cycle3(RAB, CD, 2);
	decrypt_cycle3(RAB, CD, 1);
	decrypt_cycle3(RAB, CD, 0);
	pop_cd();

	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %r12;
	popq %r13;
	ret;
ENDPROC(twofish_dec_blk_3way)