1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#if !defined(__has_feature) 5#define __has_feature(x) 0 6#endif 7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 8#define OPENSSL_NO_ASM 9#endif 10 11#if !defined(OPENSSL_NO_ASM) 12#if defined(__aarch64__) 13#if defined(BORINGSSL_PREFIX) 14#include <boringssl_prefix_symbols_asm.h> 15#endif 16#include <openssl/arm_arch.h> 17 18.section .rodata 19 20 21.align 7 // totally strategic alignment 22_vpaes_consts: 23Lk_mc_forward: // mc_forward 24.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 25.quad 0x080B0A0904070605, 0x000302010C0F0E0D 26.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 27.quad 0x000302010C0F0E0D, 0x080B0A0904070605 28Lk_mc_backward: // mc_backward 29.quad 0x0605040702010003, 0x0E0D0C0F0A09080B 30.quad 0x020100030E0D0C0F, 0x0A09080B06050407 31.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 32.quad 0x0A09080B06050407, 0x020100030E0D0C0F 33Lk_sr: // sr 34.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 35.quad 0x030E09040F0A0500, 0x0B06010C07020D08 36.quad 0x0F060D040B020900, 0x070E050C030A0108 37.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 38 39// 40// "Hot" constants 41// 42Lk_inv: // inv, inva 43.quad 0x0E05060F0D080180, 0x040703090A0B0C02 44.quad 0x01040A060F0B0780, 0x030D0E0C02050809 45Lk_ipt: // input transform (lo, hi) 46.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 47.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 48Lk_sbo: // sbou, sbot 49.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 50.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA 51Lk_sb1: // sb1u, sb1t 52.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF 53.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 54Lk_sb2: // sb2u, sb2t 55.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A 56.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD 57 58// 59// Decryption stuff 60// 61Lk_dipt: // decryption input transform 62.quad 0x0F505B040B545F00, 0x154A411E114E451A 63.quad 0x86E383E660056500, 0x12771772F491F194 64Lk_dsbo: // decryption sbox final output 65.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D 66.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C 67Lk_dsb9: // decryption sbox output *9*u, *9*t 68.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 69.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 70Lk_dsbd: // decryption sbox output *D*u, *D*t 71.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 72.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 73Lk_dsbb: // decryption sbox output *B*u, *B*t 74.quad 0xD022649296B44200, 0x602646F6B0F2D404 75.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B 76Lk_dsbe: // decryption sbox output *E*u, *E*t 77.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 78.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 79 80// 81// Key schedule constants 82// 83Lk_dksd: // decryption key schedule: invskew x*D 84.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 85.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E 86Lk_dksb: // decryption key schedule: invskew x*B 87.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 88.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 89Lk_dkse: // decryption key schedule: invskew x*E + 0x63 90.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 91.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 92Lk_dks9: // decryption key schedule: invskew x*9 93.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC 94.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE 95 96Lk_rcon: // rcon 97.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 98 99Lk_opt: // output transform 100.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 101.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 102Lk_deskew: // deskew tables: inverts the sbox's "skew" 103.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A 104.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 105 106.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 107.align 2 108 109.align 6 110 111.text 112## 113## _aes_preheat 114## 115## Fills register %r10 -> .aes_consts (so you can -fPIC) 116## and %xmm9-%xmm15 as specified below. 117## 118.def _vpaes_encrypt_preheat 119 .type 32 120.endef 121.align 4 122_vpaes_encrypt_preheat: 123 adrp x10, Lk_inv 124 add x10, x10, :lo12:Lk_inv 125 movi v17.16b, #0x0f 126 ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv 127 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo 128 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 129 ret 130 131 132## 133## _aes_encrypt_core 134## 135## AES-encrypt %xmm0. 136## 137## Inputs: 138## %xmm0 = input 139## %xmm9-%xmm15 as in _vpaes_preheat 140## (%rdx) = scheduled keys 141## 142## Output in %xmm0 143## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax 144## Preserves %xmm6 - %xmm8 so you get some local vectors 145## 146## 147.def _vpaes_encrypt_core 148 .type 32 149.endef 150.align 4 151_vpaes_encrypt_core: 152 mov x9, x2 153 ldr w8, [x2,#240] // pull rounds 154 adrp x11, Lk_mc_forward+16 155 add x11, x11, :lo12:Lk_mc_forward+16 156 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 157 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 158 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 159 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 160 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 161 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 162 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 163 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 164 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 165 b Lenc_entry 166 167.align 4 168Lenc_loop: 169 // middle of middle round 170 add x10, x11, #0x40 171 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 172 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] 173 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 174 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 175 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 176 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 177 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 178 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] 179 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 180 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 181 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 182 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 183 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 184 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 185 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 186 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 187 sub w8, w8, #1 // nr-- 188 189Lenc_entry: 190 // top of round 191 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 192 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 193 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 194 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 195 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 196 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 197 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 198 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 199 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 200 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 201 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 202 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 203 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 204 cbnz w8, Lenc_loop 205 206 // middle of last round 207 add x10, x11, #0x80 208 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 209 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 210 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 211 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] 212 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 213 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 214 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 215 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 216 ret 217 218 219.globl vpaes_encrypt 220 221.def vpaes_encrypt 222 .type 32 223.endef 224.align 4 225vpaes_encrypt: 226 AARCH64_SIGN_LINK_REGISTER 227 stp x29,x30,[sp,#-16]! 228 add x29,sp,#0 229 230 ld1 {v7.16b}, [x0] 231 bl _vpaes_encrypt_preheat 232 bl _vpaes_encrypt_core 233 st1 {v0.16b}, [x1] 234 235 ldp x29,x30,[sp],#16 236 AARCH64_VALIDATE_LINK_REGISTER 237 ret 238 239 240.def _vpaes_encrypt_2x 241 .type 32 242.endef 243.align 4 244_vpaes_encrypt_2x: 245 mov x9, x2 246 ldr w8, [x2,#240] // pull rounds 247 adrp x11, Lk_mc_forward+16 248 add x11, x11, :lo12:Lk_mc_forward+16 249 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 250 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 251 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 252 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 253 and v9.16b, v15.16b, v17.16b 254 ushr v8.16b, v15.16b, #4 255 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 256 tbl v9.16b, {v20.16b}, v9.16b 257 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 258 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 259 tbl v10.16b, {v21.16b}, v8.16b 260 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 261 eor v8.16b, v9.16b, v16.16b 262 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 263 eor v8.16b, v8.16b, v10.16b 264 b Lenc_2x_entry 265 266.align 4 267Lenc_2x_loop: 268 // middle of middle round 269 add x10, x11, #0x40 270 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 271 tbl v12.16b, {v25.16b}, v10.16b 272 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] 273 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 274 tbl v8.16b, {v24.16b}, v11.16b 275 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 276 eor v12.16b, v12.16b, v16.16b 277 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 278 tbl v13.16b, {v27.16b}, v10.16b 279 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 280 eor v8.16b, v8.16b, v12.16b 281 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 282 tbl v10.16b, {v26.16b}, v11.16b 283 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] 284 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 285 tbl v11.16b, {v8.16b}, v1.16b 286 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 287 eor v10.16b, v10.16b, v13.16b 288 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 289 tbl v8.16b, {v8.16b}, v4.16b 290 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 291 eor v11.16b, v11.16b, v10.16b 292 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 293 tbl v12.16b, {v11.16b},v1.16b 294 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 295 eor v8.16b, v8.16b, v11.16b 296 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 297 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 298 eor v8.16b, v8.16b, v12.16b 299 sub w8, w8, #1 // nr-- 300 301Lenc_2x_entry: 302 // top of round 303 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 304 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 305 and v9.16b, v8.16b, v17.16b 306 ushr v8.16b, v8.16b, #4 307 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 308 tbl v13.16b, {v19.16b},v9.16b 309 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 310 eor v9.16b, v9.16b, v8.16b 311 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 312 tbl v11.16b, {v18.16b},v8.16b 313 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 314 tbl v12.16b, {v18.16b},v9.16b 315 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 316 eor v11.16b, v11.16b, v13.16b 317 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 318 eor v12.16b, v12.16b, v13.16b 319 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 320 tbl v10.16b, {v18.16b},v11.16b 321 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 322 tbl v11.16b, {v18.16b},v12.16b 323 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 324 eor v10.16b, v10.16b, v9.16b 325 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 326 eor v11.16b, v11.16b, v8.16b 327 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 328 cbnz w8, Lenc_2x_loop 329 330 // middle of last round 331 add x10, x11, #0x80 332 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 333 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 334 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 335 tbl v12.16b, {v22.16b}, v10.16b 336 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] 337 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 338 tbl v8.16b, {v23.16b}, v11.16b 339 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 340 eor v12.16b, v12.16b, v16.16b 341 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 342 eor v8.16b, v8.16b, v12.16b 343 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 344 tbl v1.16b, {v8.16b},v1.16b 345 ret 346 347 348.def _vpaes_decrypt_preheat 349 .type 32 350.endef 351.align 4 352_vpaes_decrypt_preheat: 353 adrp x10, Lk_inv 354 add x10, x10, :lo12:Lk_inv 355 movi v17.16b, #0x0f 356 adrp x11, Lk_dipt 357 add x11, x11, :lo12:Lk_dipt 358 ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv 359 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo 360 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd 361 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe 362 ret 363 364 365## 366## Decryption core 367## 368## Same API as encryption core. 369## 370.def _vpaes_decrypt_core 371 .type 32 372.endef 373.align 4 374_vpaes_decrypt_core: 375 mov x9, x2 376 ldr w8, [x2,#240] // pull rounds 377 378 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 379 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 380 eor x11, x11, #0x30 // xor $0x30, %r11 381 adrp x10, Lk_sr 382 add x10, x10, :lo12:Lk_sr 383 and x11, x11, #0x30 // and $0x30, %r11 384 add x11, x11, x10 385 adrp x10, Lk_mc_forward+48 386 add x10, x10, :lo12:Lk_mc_forward+48 387 388 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 389 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 390 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 391 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 392 ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 393 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 394 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 395 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 396 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 397 b Ldec_entry 398 399.align 4 400Ldec_loop: 401// 402// Inverse mix columns 403// 404 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 405 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 406 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 407 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 408 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 409 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 410 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 411 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 412 413 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 414 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 415 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 416 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 417 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 418 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 419 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 420 421 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 422 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 423 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 424 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 425 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 426 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 427 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 428 429 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 430 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 431 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 432 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 433 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 434 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 435 sub w8, w8, #1 // sub $1,%rax # nr-- 436 437Ldec_entry: 438 // top of round 439 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 440 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 441 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 442 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 443 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 444 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 445 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 446 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 447 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 448 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 449 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 450 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 451 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 452 cbnz w8, Ldec_loop 453 454 // middle of last round 455 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 456 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 457 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 458 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 459 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 460 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 461 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 462 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 463 ret 464 465 466.globl vpaes_decrypt 467 468.def vpaes_decrypt 469 .type 32 470.endef 471.align 4 472vpaes_decrypt: 473 AARCH64_SIGN_LINK_REGISTER 474 stp x29,x30,[sp,#-16]! 475 add x29,sp,#0 476 477 ld1 {v7.16b}, [x0] 478 bl _vpaes_decrypt_preheat 479 bl _vpaes_decrypt_core 480 st1 {v0.16b}, [x1] 481 482 ldp x29,x30,[sp],#16 483 AARCH64_VALIDATE_LINK_REGISTER 484 ret 485 486 487// v14-v15 input, v0-v1 output 488.def _vpaes_decrypt_2x 489 .type 32 490.endef 491.align 4 492_vpaes_decrypt_2x: 493 mov x9, x2 494 ldr w8, [x2,#240] // pull rounds 495 496 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 497 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 498 eor x11, x11, #0x30 // xor $0x30, %r11 499 adrp x10, Lk_sr 500 add x10, x10, :lo12:Lk_sr 501 and x11, x11, #0x30 // and $0x30, %r11 502 add x11, x11, x10 503 adrp x10, Lk_mc_forward+48 504 add x10, x10, :lo12:Lk_mc_forward+48 505 506 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 507 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 508 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 509 and v9.16b, v15.16b, v17.16b 510 ushr v8.16b, v15.16b, #4 511 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 512 tbl v10.16b, {v20.16b},v9.16b 513 ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 514 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 515 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 516 tbl v8.16b, {v21.16b},v8.16b 517 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 518 eor v10.16b, v10.16b, v16.16b 519 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 520 eor v8.16b, v8.16b, v10.16b 521 b Ldec_2x_entry 522 523.align 4 524Ldec_2x_loop: 525// 526// Inverse mix columns 527// 528 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 529 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 530 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 531 tbl v12.16b, {v24.16b}, v10.16b 532 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 533 tbl v9.16b, {v25.16b}, v11.16b 534 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 535 eor v8.16b, v12.16b, v16.16b 536 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 537 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 538 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 539 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 540 541 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 542 tbl v12.16b, {v26.16b}, v10.16b 543 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 544 tbl v8.16b, {v8.16b},v5.16b 545 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 546 tbl v9.16b, {v27.16b}, v11.16b 547 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 548 eor v8.16b, v8.16b, v12.16b 549 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 550 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 551 eor v8.16b, v8.16b, v9.16b 552 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 553 554 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 555 tbl v12.16b, {v28.16b}, v10.16b 556 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 557 tbl v8.16b, {v8.16b},v5.16b 558 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 559 tbl v9.16b, {v29.16b}, v11.16b 560 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 561 eor v8.16b, v8.16b, v12.16b 562 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 563 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 564 eor v8.16b, v8.16b, v9.16b 565 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 566 567 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 568 tbl v12.16b, {v30.16b}, v10.16b 569 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 570 tbl v8.16b, {v8.16b},v5.16b 571 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 572 tbl v9.16b, {v31.16b}, v11.16b 573 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 574 eor v8.16b, v8.16b, v12.16b 575 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 576 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 577 eor v8.16b, v8.16b, v9.16b 578 sub w8, w8, #1 // sub $1,%rax # nr-- 579 580Ldec_2x_entry: 581 // top of round 582 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 583 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 584 and v9.16b, v8.16b, v17.16b 585 ushr v8.16b, v8.16b, #4 586 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 587 tbl v10.16b, {v19.16b},v9.16b 588 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 589 eor v9.16b, v9.16b, v8.16b 590 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 591 tbl v11.16b, {v18.16b},v8.16b 592 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 593 tbl v12.16b, {v18.16b},v9.16b 594 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 595 eor v11.16b, v11.16b, v10.16b 596 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 597 eor v12.16b, v12.16b, v10.16b 598 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 599 tbl v10.16b, {v18.16b},v11.16b 600 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 601 tbl v11.16b, {v18.16b},v12.16b 602 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 603 eor v10.16b, v10.16b, v9.16b 604 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 605 eor v11.16b, v11.16b, v8.16b 606 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 607 cbnz w8, Ldec_2x_loop 608 609 // middle of last round 610 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 611 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 612 tbl v12.16b, {v22.16b}, v10.16b 613 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 614 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 615 tbl v9.16b, {v23.16b}, v11.16b 616 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 617 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 618 eor v12.16b, v12.16b, v16.16b 619 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 620 eor v8.16b, v9.16b, v12.16b 621 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 622 tbl v1.16b, {v8.16b},v2.16b 623 ret 624 625######################################################## 626## ## 627## AES key schedule ## 628## ## 629######################################################## 630.def _vpaes_key_preheat 631 .type 32 632.endef 633.align 4 634_vpaes_key_preheat: 635 adrp x10, Lk_inv 636 add x10, x10, :lo12:Lk_inv 637 movi v16.16b, #0x5b // Lk_s63 638 adrp x11, Lk_sb1 639 add x11, x11, :lo12:Lk_sb1 640 movi v17.16b, #0x0f // Lk_s0F 641 ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt 642 adrp x10, Lk_dksd 643 add x10, x10, :lo12:Lk_dksd 644 ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 645 adrp x11, Lk_mc_forward 646 add x11, x11, :lo12:Lk_mc_forward 647 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb 648 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 649 ld1 {v8.2d}, [x10] // Lk_rcon 650 ld1 {v9.2d}, [x11] // Lk_mc_forward[0] 651 ret 652 653 654.def _vpaes_schedule_core 655 .type 32 656.endef 657.align 4 658_vpaes_schedule_core: 659 AARCH64_SIGN_LINK_REGISTER 660 stp x29, x30, [sp,#-16]! 661 add x29,sp,#0 662 663 bl _vpaes_key_preheat // load the tables 664 665 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) 666 667 // input transform 668 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 669 bl _vpaes_schedule_transform 670 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 671 672 adrp x10, Lk_sr // lea Lk_sr(%rip),%r10 673 add x10, x10, :lo12:Lk_sr 674 675 add x8, x8, x10 676 cbnz w3, Lschedule_am_decrypting 677 678 // encrypting, output zeroth round key after transform 679 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) 680 b Lschedule_go 681 682Lschedule_am_decrypting: 683 // decrypting, output zeroth round key after shiftrows 684 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 685 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 686 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 687 eor x8, x8, #0x30 // xor $0x30, %r8 688 689Lschedule_go: 690 cmp w1, #192 // cmp $192, %esi 691 b.hi Lschedule_256 692 b.eq Lschedule_192 693 // 128: fall though 694 695## 696## .schedule_128 697## 698## 128-bit specific part of key schedule. 699## 700## This schedule is really simple, because all its parts 701## are accomplished by the subroutines. 702## 703Lschedule_128: 704 mov x0, #10 // mov $10, %esi 705 706Loop_schedule_128: 707 sub x0, x0, #1 // dec %esi 708 bl _vpaes_schedule_round 709 cbz x0, Lschedule_mangle_last 710 bl _vpaes_schedule_mangle // write output 711 b Loop_schedule_128 712 713## 714## .aes_schedule_192 715## 716## 192-bit specific part of key schedule. 717## 718## The main body of this schedule is the same as the 128-bit 719## schedule, but with more smearing. The long, high side is 720## stored in %xmm7 as before, and the short, low side is in 721## the high bits of %xmm6. 722## 723## This schedule is somewhat nastier, however, because each 724## round produces 192 bits of key material, or 1.5 round keys. 725## Therefore, on each cycle we do 2 rounds and produce 3 round 726## keys. 727## 728.align 4 729Lschedule_192: 730 sub x0, x0, #8 731 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 732 bl _vpaes_schedule_transform // input transform 733 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part 734 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 735 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros 736 mov x0, #4 // mov $4, %esi 737 738Loop_schedule_192: 739 sub x0, x0, #1 // dec %esi 740 bl _vpaes_schedule_round 741 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 742 bl _vpaes_schedule_mangle // save key n 743 bl _vpaes_schedule_192_smear 744 bl _vpaes_schedule_mangle // save key n+1 745 bl _vpaes_schedule_round 746 cbz x0, Lschedule_mangle_last 747 bl _vpaes_schedule_mangle // save key n+2 748 bl _vpaes_schedule_192_smear 749 b Loop_schedule_192 750 751## 752## .aes_schedule_256 753## 754## 256-bit specific part of key schedule. 755## 756## The structure here is very similar to the 128-bit 757## schedule, but with an additional "low side" in 758## %xmm6. The low side's rounds are the same as the 759## high side's, except no rcon and no rotation. 760## 761.align 4 762Lschedule_256: 763 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 764 bl _vpaes_schedule_transform // input transform 765 mov x0, #7 // mov $7, %esi 766 767Loop_schedule_256: 768 sub x0, x0, #1 // dec %esi 769 bl _vpaes_schedule_mangle // output low result 770 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 771 772 // high round 773 bl _vpaes_schedule_round 774 cbz x0, Lschedule_mangle_last 775 bl _vpaes_schedule_mangle 776 777 // low round. swap xmm7 and xmm6 778 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 779 movi v4.16b, #0 780 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 781 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 782 bl _vpaes_schedule_low_round 783 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 784 785 b Loop_schedule_256 786 787## 788## .aes_schedule_mangle_last 789## 790## Mangler for last round of key schedule 791## Mangles %xmm0 792## when encrypting, outputs out(%xmm0) ^ 63 793## when decrypting, outputs unskew(%xmm0) 794## 795## Always called right before return... jumps to cleanup and exits 796## 797.align 4 798Lschedule_mangle_last: 799 // schedule last round key from xmm0 800 adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew 801 add x11, x11, :lo12:Lk_deskew 802 803 cbnz w3, Lschedule_mangle_last_dec 804 805 // encrypting 806 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 807 adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform 808 add x11, x11, :lo12:Lk_opt 809 add x2, x2, #32 // add $32, %rdx 810 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute 811 812Lschedule_mangle_last_dec: 813 ld1 {v20.2d,v21.2d}, [x11] // reload constants 814 sub x2, x2, #16 // add $-16, %rdx 815 eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 816 bl _vpaes_schedule_transform // output transform 817 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key 818 819 // cleanup 820 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 821 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 822 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 823 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 824 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 825 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 826 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 827 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 828 ldp x29, x30, [sp],#16 829 AARCH64_VALIDATE_LINK_REGISTER 830 ret 831 832 833## 834## .aes_schedule_192_smear 835## 836## Smear the short, low side in the 192-bit key schedule. 837## 838## Inputs: 839## %xmm7: high side, b a x y 840## %xmm6: low side, d c 0 0 841## %xmm13: 0 842## 843## Outputs: 844## %xmm6: b+c+d b+c 0 0 845## %xmm0: b+c+d b+c b a 846## 847.def _vpaes_schedule_192_smear 848 .type 32 849.endef 850.align 4 851_vpaes_schedule_192_smear: 852 movi v1.16b, #0 853 dup v0.4s, v7.s[3] 854 ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 855 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 856 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 857 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 858 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 859 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 860 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros 861 ret 862 863 864## 865## .aes_schedule_round 866## 867## Runs one main round of the key schedule on %xmm0, %xmm7 868## 869## Specifically, runs subbytes on the high dword of %xmm0 870## then rotates it by one byte and xors into the low dword of 871## %xmm7. 872## 873## Adds rcon from low byte of %xmm8, then rotates %xmm8 for 874## next rcon. 875## 876## Smears the dwords of %xmm7 by xoring the low into the 877## second low, result into third, result into highest. 878## 879## Returns results in %xmm7 = %xmm0. 880## Clobbers %xmm1-%xmm4, %r11. 881## 882.def _vpaes_schedule_round 883 .type 32 884.endef 885.align 4 886_vpaes_schedule_round: 887 // extract rcon from xmm8 888 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 889 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 890 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 891 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 892 893 // rotate 894 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 895 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 896 897 // fall through... 898 899 // low round: same as high round, but no rotation and no rcon. 900_vpaes_schedule_low_round: 901 // smear xmm7 902 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 903 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 904 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 905 906 // subbytes 907 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 908 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 909 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 910 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 911 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 912 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 913 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 914 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 915 eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 916 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 917 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 918 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 919 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io 920 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 921 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 922 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 923 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 924 925 // add in smeared stuff 926 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 927 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 928 ret 929 930 931## 932## .aes_schedule_transform 933## 934## Linear-transform %xmm0 according to tables at (%r11) 935## 936## Requires that %xmm9 = 0x0F0F... as in preheat 937## Output in %xmm0 938## Clobbers %xmm1, %xmm2 939## 940.def _vpaes_schedule_transform 941 .type 32 942.endef 943.align 4 944_vpaes_schedule_transform: 945 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 946 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 947 // vmovdqa (%r11), %xmm2 # lo 948 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 949 // vmovdqa 16(%r11), %xmm1 # hi 950 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 951 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 952 ret 953 954 955## 956## .aes_schedule_mangle 957## 958## Mangle xmm0 from (basis-transformed) standard version 959## to our version. 960## 961## On encrypt, 962## xor with 0x63 963## multiply by circulant 0,1,1,1 964## apply shiftrows transform 965## 966## On decrypt, 967## xor with 0x63 968## multiply by "inverse mixcolumns" circulant E,B,D,9 969## deskew 970## apply shiftrows transform 971## 972## 973## Writes out to (%rdx), and increments or decrements it 974## Keeps track of round number mod 4 in %r8 975## Preserves xmm0 976## Clobbers xmm1-xmm5 977## 978.def _vpaes_schedule_mangle 979 .type 32 980.endef 981.align 4 982_vpaes_schedule_mangle: 983 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later 984 // vmovdqa .Lk_mc_forward(%rip),%xmm5 985 cbnz w3, Lschedule_mangle_dec 986 987 // encrypting 988 eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 989 add x2, x2, #16 // add $16, %rdx 990 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 991 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 992 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 993 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 994 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 995 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 996 997 b Lschedule_mangle_both 998.align 4 999Lschedule_mangle_dec: 1000 // inverse mix columns 1001 // lea .Lk_dksd(%rip),%r11 1002 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi 1003 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo 1004 1005 // vmovdqa 0x00(%r11), %xmm2 1006 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1007 // vmovdqa 0x10(%r11), %xmm3 1008 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1009 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 1010 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 1011 1012 // vmovdqa 0x20(%r11), %xmm2 1013 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1014 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 1015 // vmovdqa 0x30(%r11), %xmm3 1016 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1017 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 1018 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 1019 1020 // vmovdqa 0x40(%r11), %xmm2 1021 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1022 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 1023 // vmovdqa 0x50(%r11), %xmm3 1024 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1025 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 1026 1027 // vmovdqa 0x60(%r11), %xmm2 1028 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1029 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 1030 // vmovdqa 0x70(%r11), %xmm4 1031 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 1032 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 1033 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 1034 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 1035 1036 sub x2, x2, #16 // add $-16, %rdx 1037 1038Lschedule_mangle_both: 1039 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1040 add x8, x8, #48 // add $-16, %r8 1041 and x8, x8, #~(1<<6) // and $0x30, %r8 1042 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 1043 ret 1044 1045 1046.globl vpaes_set_encrypt_key 1047 1048.def vpaes_set_encrypt_key 1049 .type 32 1050.endef 1051.align 4 1052vpaes_set_encrypt_key: 1053 AARCH64_SIGN_LINK_REGISTER 1054 stp x29,x30,[sp,#-16]! 1055 add x29,sp,#0 1056 stp d8,d9,[sp,#-16]! // ABI spec says so 1057 1058 lsr w9, w1, #5 // shr $5,%eax 1059 add w9, w9, #5 // $5,%eax 1060 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1061 1062 mov w3, #0 // mov $0,%ecx 1063 mov x8, #0x30 // mov $0x30,%r8d 1064 bl _vpaes_schedule_core 1065 eor x0, x0, x0 1066 1067 ldp d8,d9,[sp],#16 1068 ldp x29,x30,[sp],#16 1069 AARCH64_VALIDATE_LINK_REGISTER 1070 ret 1071 1072 1073.globl vpaes_set_decrypt_key 1074 1075.def vpaes_set_decrypt_key 1076 .type 32 1077.endef 1078.align 4 1079vpaes_set_decrypt_key: 1080 AARCH64_SIGN_LINK_REGISTER 1081 stp x29,x30,[sp,#-16]! 1082 add x29,sp,#0 1083 stp d8,d9,[sp,#-16]! // ABI spec says so 1084 1085 lsr w9, w1, #5 // shr $5,%eax 1086 add w9, w9, #5 // $5,%eax 1087 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1088 lsl w9, w9, #4 // shl $4,%eax 1089 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx 1090 add x2, x2, x9 1091 1092 mov w3, #1 // mov $1,%ecx 1093 lsr w8, w1, #1 // shr $1,%r8d 1094 and x8, x8, #32 // and $32,%r8d 1095 eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 1096 bl _vpaes_schedule_core 1097 1098 ldp d8,d9,[sp],#16 1099 ldp x29,x30,[sp],#16 1100 AARCH64_VALIDATE_LINK_REGISTER 1101 ret 1102 1103.globl vpaes_cbc_encrypt 1104 1105.def vpaes_cbc_encrypt 1106 .type 32 1107.endef 1108.align 4 1109vpaes_cbc_encrypt: 1110 AARCH64_SIGN_LINK_REGISTER 1111 cbz x2, Lcbc_abort 1112 cmp w5, #0 // check direction 1113 b.eq vpaes_cbc_decrypt 1114 1115 stp x29,x30,[sp,#-16]! 1116 add x29,sp,#0 1117 1118 mov x17, x2 // reassign 1119 mov x2, x3 // reassign 1120 1121 ld1 {v0.16b}, [x4] // load ivec 1122 bl _vpaes_encrypt_preheat 1123 b Lcbc_enc_loop 1124 1125.align 4 1126Lcbc_enc_loop: 1127 ld1 {v7.16b}, [x0],#16 // load input 1128 eor v7.16b, v7.16b, v0.16b // xor with ivec 1129 bl _vpaes_encrypt_core 1130 st1 {v0.16b}, [x1],#16 // save output 1131 subs x17, x17, #16 1132 b.hi Lcbc_enc_loop 1133 1134 st1 {v0.16b}, [x4] // write ivec 1135 1136 ldp x29,x30,[sp],#16 1137 AARCH64_VALIDATE_LINK_REGISTER 1138Lcbc_abort: 1139 ret 1140 1141 1142.def vpaes_cbc_decrypt 1143 .type 32 1144.endef 1145.align 4 1146vpaes_cbc_decrypt: 1147 // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to 1148 // only from vpaes_cbc_encrypt which has already signed the return address. 1149 stp x29,x30,[sp,#-16]! 1150 add x29,sp,#0 1151 stp d8,d9,[sp,#-16]! // ABI spec says so 1152 stp d10,d11,[sp,#-16]! 1153 stp d12,d13,[sp,#-16]! 1154 stp d14,d15,[sp,#-16]! 1155 1156 mov x17, x2 // reassign 1157 mov x2, x3 // reassign 1158 ld1 {v6.16b}, [x4] // load ivec 1159 bl _vpaes_decrypt_preheat 1160 tst x17, #16 1161 b.eq Lcbc_dec_loop2x 1162 1163 ld1 {v7.16b}, [x0], #16 // load input 1164 bl _vpaes_decrypt_core 1165 eor v0.16b, v0.16b, v6.16b // xor with ivec 1166 orr v6.16b, v7.16b, v7.16b // next ivec value 1167 st1 {v0.16b}, [x1], #16 1168 subs x17, x17, #16 1169 b.ls Lcbc_dec_done 1170 1171.align 4 1172Lcbc_dec_loop2x: 1173 ld1 {v14.16b,v15.16b}, [x0], #32 1174 bl _vpaes_decrypt_2x 1175 eor v0.16b, v0.16b, v6.16b // xor with ivec 1176 eor v1.16b, v1.16b, v14.16b 1177 orr v6.16b, v15.16b, v15.16b 1178 st1 {v0.16b,v1.16b}, [x1], #32 1179 subs x17, x17, #32 1180 b.hi Lcbc_dec_loop2x 1181 1182Lcbc_dec_done: 1183 st1 {v6.16b}, [x4] 1184 1185 ldp d14,d15,[sp],#16 1186 ldp d12,d13,[sp],#16 1187 ldp d10,d11,[sp],#16 1188 ldp d8,d9,[sp],#16 1189 ldp x29,x30,[sp],#16 1190 AARCH64_VALIDATE_LINK_REGISTER 1191 ret 1192 1193.globl vpaes_ctr32_encrypt_blocks 1194 1195.def vpaes_ctr32_encrypt_blocks 1196 .type 32 1197.endef 1198.align 4 1199vpaes_ctr32_encrypt_blocks: 1200 AARCH64_SIGN_LINK_REGISTER 1201 stp x29,x30,[sp,#-16]! 1202 add x29,sp,#0 1203 stp d8,d9,[sp,#-16]! // ABI spec says so 1204 stp d10,d11,[sp,#-16]! 1205 stp d12,d13,[sp,#-16]! 1206 stp d14,d15,[sp,#-16]! 1207 1208 cbz x2, Lctr32_done 1209 1210 // Note, unlike the other functions, x2 here is measured in blocks, 1211 // not bytes. 1212 mov x17, x2 1213 mov x2, x3 1214 1215 // Load the IV and counter portion. 1216 ldr w6, [x4, #12] 1217 ld1 {v7.16b}, [x4] 1218 1219 bl _vpaes_encrypt_preheat 1220 tst x17, #1 1221 rev w6, w6 // The counter is big-endian. 1222 b.eq Lctr32_prep_loop 1223 1224 // Handle one block so the remaining block count is even for 1225 // _vpaes_encrypt_2x. 1226 ld1 {v6.16b}, [x0], #16 // Load input ahead of time 1227 bl _vpaes_encrypt_core 1228 eor v0.16b, v0.16b, v6.16b // XOR input and result 1229 st1 {v0.16b}, [x1], #16 1230 subs x17, x17, #1 1231 // Update the counter. 1232 add w6, w6, #1 1233 rev w7, w6 1234 mov v7.s[3], w7 1235 b.ls Lctr32_done 1236 1237Lctr32_prep_loop: 1238 // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x 1239 // uses v14 and v15. 1240 mov v15.16b, v7.16b 1241 mov v14.16b, v7.16b 1242 add w6, w6, #1 1243 rev w7, w6 1244 mov v15.s[3], w7 1245 1246Lctr32_loop: 1247 ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time 1248 bl _vpaes_encrypt_2x 1249 eor v0.16b, v0.16b, v6.16b // XOR input and result 1250 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) 1251 st1 {v0.16b,v1.16b}, [x1], #32 1252 subs x17, x17, #2 1253 // Update the counter. 1254 add w7, w6, #1 1255 add w6, w6, #2 1256 rev w7, w7 1257 mov v14.s[3], w7 1258 rev w7, w6 1259 mov v15.s[3], w7 1260 b.hi Lctr32_loop 1261 1262Lctr32_done: 1263 ldp d14,d15,[sp],#16 1264 ldp d12,d13,[sp],#16 1265 ldp d10,d11,[sp],#16 1266 ldp d8,d9,[sp],#16 1267 ldp x29,x30,[sp],#16 1268 AARCH64_VALIDATE_LINK_REGISTER 1269 ret 1270 1271#endif 1272#endif // !OPENSSL_NO_ASM 1273