// SPDX-License-Identifier: GPL-2.0
/*
 * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text

	// arguments
	ROUND_KEYS	.req	x0	// const {u64,u32} *round_keys
	NROUNDS		.req	w1	// int nrounds
	NROUNDS_X	.req	x1	// 64-bit view of nrounds, for address arithmetic
	DST		.req	x2	// void *dst
	SRC		.req	x3	// const void *src
	NBYTES		.req	w4	// unsigned int nbytes
	TWEAK		.req	x5	// void *tweak

	// registers which hold the data being encrypted/decrypted
	// (underscores avoid a naming collision with ARM64 registers x0-x3)
	X_0		.req	v0
	Y_0		.req	v1
	X_1		.req	v2
	Y_1		.req	v3
	X_2		.req	v4
	Y_2		.req	v5
	X_3		.req	v6
	Y_3		.req	v7

	// the round key, duplicated in all lanes
	ROUND_KEY	.req	v8

	// index vector for tbl-based 8-bit rotates
	ROTATE_TABLE	.req	v9
	ROTATE_TABLE_Q	.req	q9

	// temporary registers
	TMP0		.req	v10
	TMP1		.req	v11
	TMP2		.req	v12
	TMP3		.req	v13

	// multiplication table for updating XTS tweaks
	GFMUL_TABLE	.req	v14
	GFMUL_TABLE_Q	.req	q14

	// next XTS tweak value(s)
	TWEAKV_NEXT	.req	v15

	// XTS tweaks for the blocks currently being encrypted/decrypted
	TWEAKV0		.req	v16
	TWEAKV1		.req	v17
	TWEAKV2		.req	v18
	TWEAKV3		.req	v19
	TWEAKV4		.req	v20
	TWEAKV5		.req	v21
	TWEAKV6		.req	v22
	TWEAKV7		.req	v23

	.align		4
.Lror64_8_table:
	.octa		0x080f0e0d0c0b0a090007060504030201
.Lror32_8_table:
	.octa		0x0c0f0e0d080b0a090407060500030201
.Lrol64_8_table:
	.octa		0x0e0d0c0b0a09080f0605040302010007
.Lrol32_8_table:
	.octa		0x0e0d0c0f0a09080b0605040702010003
.Lgf128mul_table:
	.octa		0x00000000000000870000000000000001
.Lgf64mul_table:
	.octa		0x0000000000000000000000002d361b00

/*
 * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
 *
 * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
 * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
 * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
 * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
 */
.macro _speck_round_128bytes	n, lanes

	// x = ror(x, 8)
	// (implemented as a byte-table shuffle, since the rotate amount is a
	// whole number of bytes)
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b

	// x += y
	add		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	add		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	add		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	add		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// y = rol(y, 3)
	// (shift left, then insert the bits shifted out, via sri)
	shl		TMP0.\lanes, Y_0.\lanes, #3
	shl		TMP1.\lanes, Y_1.\lanes, #3
	shl		TMP2.\lanes, Y_2.\lanes, #3
	shl		TMP3.\lanes, Y_3.\lanes, #3
	sri		TMP0.\lanes, Y_0.\lanes, #(\n - 3)
	sri		TMP1.\lanes, Y_1.\lanes, #(\n - 3)
	sri		TMP2.\lanes, Y_2.\lanes, #(\n - 3)
	sri		TMP3.\lanes, Y_3.\lanes, #(\n - 3)

	// y ^= x
	eor		Y_0.16b, TMP0.16b, X_0.16b
	eor		Y_1.16b, TMP1.16b, X_1.16b
	eor		Y_2.16b, TMP2.16b, X_2.16b
	eor		Y_3.16b, TMP3.16b, X_3.16b
.endm

/*
 * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 *
 * This is the inverse of _speck_round_128bytes().
 */
.macro _speck_unround_128bytes	n, lanes

	// y ^= x
	eor		TMP0.16b, Y_0.16b, X_0.16b
	eor		TMP1.16b, Y_1.16b, X_1.16b
	eor		TMP2.16b, Y_2.16b, X_2.16b
	eor		TMP3.16b, Y_3.16b, X_3.16b

	// y = ror(y, 3)
	// (shift right, then insert the bits shifted out, via sli)
	ushr		Y_0.\lanes, TMP0.\lanes, #3
	ushr		Y_1.\lanes, TMP1.\lanes, #3
	ushr		Y_2.\lanes, TMP2.\lanes, #3
	ushr		Y_3.\lanes, TMP3.\lanes, #3
	sli		Y_0.\lanes, TMP0.\lanes, #(\n - 3)
	sli		Y_1.\lanes, TMP1.\lanes, #(\n - 3)
	sli		Y_2.\lanes, TMP2.\lanes, #(\n - 3)
	sli		Y_3.\lanes, TMP3.\lanes, #(\n - 3)

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// x -= y
	sub		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	sub		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	sub		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	sub		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x = rol(x, 8)
	// (ROTATE_TABLE holds the rol table when decrypting; see
	// _speck_xts_crypt)
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm

/*
 * _next_xts_tweak() - advance the XTS tweak(s) held in \cur, writing the
 * result to \next.  \tmp is a scratch vector register.
 */
.macro _next_xts_tweak	next, cur, tmp, n
.if \n == 64
	/*
	 * Calculate the next tweak by multiplying the current one by x,
	 * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
	 */
	sshr		\tmp\().2d, \cur\().2d, #63
	and		\tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
	shl		\next\().2d, \cur\().2d, #1
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\next\().16b, \next\().16b, \tmp\().16b
.else
	/*
	 * Calculate the next two tweaks by multiplying the current ones by
	 * x^2, modulo p(x) = x^64 + x^4 + x^3 + x + 1.
	 */
	ushr		\tmp\().2d, \cur\().2d, #62
	shl		\next\().2d, \cur\().2d, #2
	tbl		\tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
	eor		\next\().16b, \next\().16b, \tmp\().16b
.endif
.endm

/*
 * _speck_xts_crypt() - Speck-XTS encryption/decryption
 *
 * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST
 * buffer using Speck-XTS, specifically the variant with a block size of '2n'
 * and round count given by NROUNDS.  The expanded round keys are given in
 * ROUND_KEYS, and the current XTS tweak value is given in TWEAK.  It's assumed
 * that NBYTES is a nonzero multiple of 128.
 *
 * Clobbers: x6, w7, and v0-v23 (none of which are callee-saved beyond
 * v8-v15's low halves, which the kernel-mode NEON context handles).
 */
.macro _speck_xts_crypt	n, lanes, decrypting

	/*
	 * If decrypting, modify the ROUND_KEYS parameter to point to the last
	 * round key rather than the first, since for decryption the round keys
	 * are used in reverse order.
	 */
.if \decrypting
	mov		NROUNDS, NROUNDS	/* zero the high 32 bits */
.if \n == 64
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
	sub		ROUND_KEYS, ROUND_KEYS, #8
.else
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
	sub		ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif

	// Load the index vector for tbl-based 8-bit rotates
.if \decrypting
	ldr		ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
	ldr		ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif

	// One-time XTS preparation
.if \n == 64
	// Load first tweak
	ld1		{TWEAKV0.16b}, [TWEAK]

	// Load GF(2^128) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf128mul_table
.else
	// Load first tweak
	ld1		{TWEAKV0.8b}, [TWEAK]

	// Load GF(2^64) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf64mul_table

	// Calculate second tweak, packing it together with the first
	ushr		TMP0.2d, TWEAKV0.2d, #63
	shl		TMP1.2d, TWEAKV0.2d, #1
	tbl		TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
	eor		TMP0.8b, TMP0.8b, TMP1.8b
	mov		TWEAKV0.d[1], TMP0.d[0]
.endif

.Lnext_128bytes_\@:

	// Calculate XTS tweaks for next 128 bytes
	_next_xts_tweak	TWEAKV1, TWEAKV0, TMP0, \n
	_next_xts_tweak	TWEAKV2, TWEAKV1, TMP0, \n
	_next_xts_tweak	TWEAKV3, TWEAKV2, TMP0, \n
	_next_xts_tweak	TWEAKV4, TWEAKV3, TMP0, \n
	_next_xts_tweak	TWEAKV5, TWEAKV4, TMP0, \n
	_next_xts_tweak	TWEAKV6, TWEAKV5, TMP0, \n
	_next_xts_tweak	TWEAKV7, TWEAKV6, TMP0, \n
	_next_xts_tweak	TWEAKV_NEXT, TWEAKV7, TMP0, \n

	// Load the next source blocks into {X,Y}[0-3]
	ld1		{X_0.16b-Y_1.16b}, [SRC], #64
	ld1		{X_2.16b-Y_3.16b}, [SRC], #64

	// XOR the source blocks with their XTS tweaks
	eor		TMP0.16b, X_0.16b, TWEAKV0.16b
	eor		Y_0.16b, Y_0.16b, TWEAKV1.16b
	eor		TMP1.16b, X_1.16b, TWEAKV2.16b
	eor		Y_1.16b, Y_1.16b, TWEAKV3.16b
	eor		TMP2.16b, X_2.16b, TWEAKV4.16b
	eor		Y_2.16b, Y_2.16b, TWEAKV5.16b
	eor		TMP3.16b, X_3.16b, TWEAKV6.16b
	eor		Y_3.16b, Y_3.16b, TWEAKV7.16b

	/*
	 * De-interleave the 'x' and 'y' elements of each block, i.e. make it
	 * so that the X[0-3] registers contain only the second halves of
	 * blocks, and the Y[0-3] registers contain only the first halves of
	 * blocks.  (Speck uses the order (y, x) rather than the more intuitive
	 * (x, y).)
	 */
	uzp2		X_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp1		Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp2		X_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp1		Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp2		X_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp1		Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp2		X_3.\lanes, TMP3.\lanes, Y_3.\lanes
	uzp1		Y_3.\lanes, TMP3.\lanes, Y_3.\lanes

	// Do the cipher rounds
	mov		x6, ROUND_KEYS
	mov		w7, NROUNDS
.Lnext_round_\@:
.if \decrypting
	// Broadcast the round key to all lanes, then step backwards
	ld1r		{ROUND_KEY.\lanes}, [x6]
	sub		x6, x6, #( \n / 8 )
	_speck_unround_128bytes	\n, \lanes
.else
	// Broadcast the round key to all lanes, stepping forwards
	ld1r		{ROUND_KEY.\lanes}, [x6], #( \n / 8 )
	_speck_round_128bytes	\n, \lanes
.endif
	subs		w7, w7, #1
	bne		.Lnext_round_\@

	// Re-interleave the 'x' and 'y' elements of each block
	zip1		TMP0.\lanes, Y_0.\lanes, X_0.\lanes
	zip2		Y_0.\lanes, Y_0.\lanes, X_0.\lanes
	zip1		TMP1.\lanes, Y_1.\lanes, X_1.\lanes
	zip2		Y_1.\lanes, Y_1.\lanes, X_1.\lanes
	zip1		TMP2.\lanes, Y_2.\lanes, X_2.\lanes
	zip2		Y_2.\lanes, Y_2.\lanes, X_2.\lanes
	zip1		TMP3.\lanes, Y_3.\lanes, X_3.\lanes
	zip2		Y_3.\lanes, Y_3.\lanes, X_3.\lanes

	// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
	eor		X_0.16b, TMP0.16b, TWEAKV0.16b
	eor		Y_0.16b, Y_0.16b, TWEAKV1.16b
	eor		X_1.16b, TMP1.16b, TWEAKV2.16b
	eor		Y_1.16b, Y_1.16b, TWEAKV3.16b
	eor		X_2.16b, TMP2.16b, TWEAKV4.16b
	eor		Y_2.16b, Y_2.16b, TWEAKV5.16b
	eor		X_3.16b, TMP3.16b, TWEAKV6.16b
	eor		Y_3.16b, Y_3.16b, TWEAKV7.16b
	mov		TWEAKV0.16b, TWEAKV_NEXT.16b

	// Store the ciphertext in the destination buffer
	st1		{X_0.16b-Y_1.16b}, [DST], #64
	st1		{X_2.16b-Y_3.16b}, [DST], #64

	// Continue if there are more 128-byte chunks remaining
	subs		NBYTES, NBYTES, #128
	bne		.Lnext_128bytes_\@

	// Store the next tweak and return
.if \n == 64
	st1		{TWEAKV_NEXT.16b}, [TWEAK]
.else
	st1		{TWEAKV_NEXT.8b}, [TWEAK]
.endif
	ret
.endm
/*
 * Exported entry points.  Each takes, per the .req declarations above:
 *   x0 = round_keys, w1 = nrounds, x2 = dst, x3 = src,
 *   w4 = nbytes (nonzero multiple of 128), x5 = tweak
 */
ENTRY(speck128_xts_encrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)

ENTRY(speck128_xts_decrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)

ENTRY(speck64_xts_encrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)

ENTRY(speck64_xts_decrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)