// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.section __TEXT,__const


.align 7 // totally strategic alignment
_vpaes_consts:
Lk_mc_forward: // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
Lk_mc_backward: // mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
Lk_sr: // sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
Lk_inv: // inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
Lk_ipt: // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
Lk_sbo: // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
Lk_sb1: // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
Lk_sb2: // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Decryption stuff
//
Lk_dipt: // decryption input transform
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
Lk_dsbo: // decryption sbox final output
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
Lk_dsb9: // decryption sbox output *9*u, *9*t
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
Lk_dsbd: // decryption sbox output *D*u, *D*t
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
Lk_dsbb: // decryption sbox output *B*u, *B*t
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
Lk_dsbe: // decryption sbox output *E*u, *E*t
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
Lk_dksd: // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
Lk_dksb: // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
Lk_dkse: // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
Lk_dks9: // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE

Lk_rcon: // rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

Lk_opt: // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
Lk_deskew: // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2

.align 6

.text
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
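##
## (AArch64 note: the %r10 and %xmm9-%xmm15 names above come from the
## x86-64 original.  In this port the ld1 instructions below load the
## same tables into NEON registers instead: v17 holds the 0x0f nibble
## mask, v18-v19 hold Lk_inv, v20-v23 hold Lk_ipt and Lk_sbo, and
## v24-v27 hold Lk_sb1 and Lk_sb2.)
##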

.align 4
_vpaes_encrypt_preheat:
    adrp x10, Lk_inv@PAGE
    add x10, x10, Lk_inv@PAGEOFF
    movi v17.16b, #0x0f
    ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2
    ret


##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##

.align 4
_vpaes_encrypt_core:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds
    adrp x11, Lk_mc_forward@PAGE+16
    add x11, x11, Lk_mc_forward@PAGEOFF+16
    // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
    ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
    // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
    tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
    eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    b Lenc_entry

.align 4
Lenc_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    sub w8, w8, #1 // nr--

Lenc_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
    cbnz w8, Lenc_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
    ret

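##
## void vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
##
## (Presumed C prototype; per the code below, x0 points at the 16-byte
## input block, x1 at the output block, and x2 at the key schedule,
## whose round count sits at byte offset 240.)
##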
.globl _vpaes_encrypt
.private_extern _vpaes_encrypt

.align 4
_vpaes_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_encrypt_preheat
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret

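// Encrypts two blocks at once: v14-v15 input, v0-v1 output (the
// two-block counterpart of _vpaes_encrypt_core above).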
.align 4
_vpaes_encrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds
    adrp x11, Lk_mc_forward@PAGE+16
    add x11, x11, Lk_mc_forward@PAGEOFF+16
    // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
    ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
    tbl v9.16b, {v20.16b}, v9.16b
    // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
    tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
    tbl v10.16b, {v21.16b}, v8.16b
    eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
    eor v8.16b, v9.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b Lenc_2x_entry

.align 4
Lenc_2x_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    tbl v12.16b, {v25.16b}, v10.16b
    ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    tbl v8.16b, {v24.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    tbl v13.16b, {v27.16b}, v10.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    tbl v10.16b, {v26.16b}, v11.16b
    ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    tbl v11.16b, {v8.16b}, v1.16b
    eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    eor v10.16b, v10.16b, v13.16b
    tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    tbl v8.16b, {v8.16b}, v4.16b
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    eor v11.16b, v11.16b, v10.16b
    tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    tbl v12.16b, {v11.16b},v1.16b
    eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    eor v8.16b, v8.16b, v11.16b
    and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    eor v8.16b, v8.16b, v12.16b
    sub w8, w8, #1 // nr--

Lenc_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    tbl v13.16b, {v19.16b},v9.16b
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v11.16b, {v18.16b},v8.16b
    tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    tbl v12.16b, {v18.16b},v9.16b
    eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v13.16b
    eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v13.16b
    tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v10.16b, {v18.16b},v11.16b
    tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    tbl v11.16b, {v18.16b},v12.16b
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
    cbnz w8, Lenc_2x_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    tbl v8.16b, {v23.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
    tbl v1.16b, {v8.16b},v1.16b
    ret



.align 4
_vpaes_decrypt_preheat:
    adrp x10, Lk_inv@PAGE
    add x10, x10, Lk_inv@PAGEOFF
    movi v17.16b, #0x0f
    adrp x11, Lk_dipt@PAGE
    add x11, x11, Lk_dipt@PAGEOFF
    ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe
    ret


##
## Decryption core
##
## Same API as encryption core.
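## (That is: input block in v7, output in v0, key schedule pointer in
## x2 with the round count at byte offset 240; constants are preloaded
## by _vpaes_decrypt_preheat.)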
##

.align 4
_vpaes_decrypt_core:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds

    // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
    lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
    eor x11, x11, #0x30 // xor $0x30, %r11
    adrp x10, Lk_sr@PAGE
    add x10, x10, Lk_sr@PAGEOFF
    and x11, x11, #0x30 // and $0x30, %r11
    add x11, x11, x10
    adrp x10, Lk_mc_forward@PAGE+48
    add x10, x10, Lk_mc_forward@PAGEOFF+48

    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
    and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
    ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
    // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
    tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
    eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    b Ldec_entry

.align 4
Ldec_loop:
//
// Inverse mix columns
//
    // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
    // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
    tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
    tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
    eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
    // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

    tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
    tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

    tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
    tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

    tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
    tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    sub w8, w8, #1 // sub $1,%rax # nr--

Ldec_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
    cbnz w8, Ldec_loop

    // middle of last round
    // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
    ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
    tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
    eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
    tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
    ret


.globl _vpaes_decrypt
.private_extern _vpaes_decrypt

.align 4
_vpaes_decrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_decrypt_preheat
    bl _vpaes_decrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// v14-v15 input, v0-v1 output

.align 4
_vpaes_decrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240] // pull rounds

    // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
    lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
    eor x11, x11, #0x30 // xor $0x30, %r11
    adrp x10, Lk_sr@PAGE
    add x10, x10, Lk_sr@PAGEOFF
    and x11, x11, #0x30 // and $0x30, %r11
    add x11, x11, x10
    adrp x10, Lk_mc_forward@PAGE+48
    add x10, x10, Lk_mc_forward@PAGEOFF+48

    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
    and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
    tbl v10.16b, {v20.16b},v9.16b
    ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
    // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
    tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
    tbl v8.16b, {v21.16b},v8.16b
    eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
    eor v10.16b, v10.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b Ldec_2x_entry

.align 4
Ldec_2x_loop:
//
// Inverse mix columns
//
    // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
    // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
    tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
    tbl v12.16b, {v24.16b}, v10.16b
    tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
    tbl v9.16b, {v25.16b}, v11.16b
    eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
    eor v8.16b, v12.16b, v16.16b
    // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

    tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
    tbl v12.16b, {v26.16b}, v10.16b
    tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v8.16b, {v8.16b},v5.16b
    tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
    tbl v9.16b, {v27.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

    tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
    tbl v12.16b, {v28.16b}, v10.16b
    tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v8.16b, {v8.16b},v5.16b
    tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
    tbl v9.16b, {v29.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

    tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
    tbl v12.16b, {v30.16b}, v10.16b
    tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
    tbl v8.16b, {v8.16b},v5.16b
    tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
    tbl v9.16b, {v31.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
    eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    sub w8, w8, #1 // sub $1,%rax # nr--

Ldec_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    tbl v10.16b, {v19.16b},v9.16b
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v11.16b, {v18.16b},v8.16b
    tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    tbl v12.16b, {v18.16b},v9.16b
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v10.16b
    eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v10.16b
    tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v10.16b, {v18.16b},v11.16b
    tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    tbl v11.16b, {v18.16b},v12.16b
    eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
    cbnz w8, Ldec_2x_loop

    // middle of last round
    // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
    tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
    tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
    tbl v9.16b, {v23.16b}, v11.16b
    ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
    eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
    eor v8.16b, v9.16b, v12.16b
    tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
    tbl v1.16b, {v8.16b},v2.16b
    ret

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################

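##
## (Register conventions in this port, as set up by the callers below:
## x0 = user key, w1 = key size in bits, x2 = output key schedule,
## w3 = 0 for an encryption schedule / nonzero for decryption,
## x8 = initial byte offset into the Lk_sr table.)
##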
.align 4
_vpaes_key_preheat:
    adrp x10, Lk_inv@PAGE
    add x10, x10, Lk_inv@PAGEOFF
    movi v16.16b, #0x5b // Lk_s63
    adrp x11, Lk_sb1@PAGE
    add x11, x11, Lk_sb1@PAGEOFF
    movi v17.16b, #0x0f // Lk_s0F
    ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt
    adrp x10, Lk_dksd@PAGE
    add x10, x10, Lk_dksd@PAGEOFF
    ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1
    adrp x11, Lk_mc_forward@PAGE
    add x11, x11, Lk_mc_forward@PAGEOFF
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9
    ld1 {v8.2d}, [x10] // Lk_rcon
    ld1 {v9.2d}, [x11] // Lk_mc_forward[0]
    ret



.align 4
_vpaes_schedule_core:
    AARCH64_SIGN_LINK_REGISTER
    stp x29, x30, [sp,#-16]!
    add x29,sp,#0

    bl _vpaes_key_preheat // load the tables

    ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)

    // input transform
    mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
    bl _vpaes_schedule_transform
    mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7

    adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10
    add x10, x10, Lk_sr@PAGEOFF

    add x8, x8, x10
    cbnz w3, Lschedule_am_decrypting

    // encrypting, output zeroth round key after transform
    st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
    b Lschedule_go

Lschedule_am_decrypting:
    // decrypting, output zeroth round key after shiftrows
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
    tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
    eor x8, x8, #0x30 // xor $0x30, %r8

Lschedule_go:
    cmp w1, #192 // cmp $192, %esi
    b.hi Lschedule_256
    b.eq Lschedule_192
    // 128: fall through

##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
Lschedule_128:
    mov x0, #10 // mov $10, %esi

Loop_schedule_128:
    sub x0, x0, #1 // dec %esi
    bl _vpaes_schedule_round
    cbz x0, Lschedule_mangle_last
    bl _vpaes_schedule_mangle // write output
    b Loop_schedule_128

##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
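## (In this port the high side lives in v7 and the low side in v6,
## mirroring %xmm7 and %xmm6 in the comments carried over from x86.)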
##
.align 4
Lschedule_192:
    sub x0, x0, #8
    ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
    bl _vpaes_schedule_transform // input transform
    mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
    eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
    ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
    mov x0, #4 // mov $4, %esi

Loop_schedule_192:
    sub x0, x0, #1 // dec %esi
    bl _vpaes_schedule_round
    ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
    bl _vpaes_schedule_mangle // save key n
    bl _vpaes_schedule_192_smear
    bl _vpaes_schedule_mangle // save key n+1
    bl _vpaes_schedule_round
    cbz x0, Lschedule_mangle_last
    bl _vpaes_schedule_mangle // save key n+2
    bl _vpaes_schedule_192_smear
    b Loop_schedule_192

##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
.align 4
Lschedule_256:
    ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
    bl _vpaes_schedule_transform // input transform
    mov x0, #7 // mov $7, %esi

Loop_schedule_256:
    sub x0, x0, #1 // dec %esi
    bl _vpaes_schedule_mangle // output low result
    mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

    // high round
    bl _vpaes_schedule_round
    cbz x0, Lschedule_mangle_last
    bl _vpaes_schedule_mangle

    // low round. swap xmm7 and xmm6
    dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
    movi v4.16b, #0
    mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
    mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
    bl _vpaes_schedule_low_round
    mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7

    b Loop_schedule_256

##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
.align 4
Lschedule_mangle_last:
    // schedule last round key from xmm0
    adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew
    add x11, x11, Lk_deskew@PAGEOFF

    cbnz w3, Lschedule_mangle_last_dec

    // encrypting
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
    adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform
    add x11, x11, Lk_opt@PAGEOFF
    add x2, x2, #32 // add $32, %rdx
    tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute

Lschedule_mangle_last_dec:
    ld1 {v20.2d,v21.2d}, [x11] // reload constants
    sub x2, x2, #16 // add $-16, %rdx
    eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
    bl _vpaes_schedule_transform // output transform
    st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key

    // cleanup
    eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
    eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
    eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
    eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
    eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
    eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
    eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
    eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
    ldp x29, x30, [sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##

.align 4
_vpaes_schedule_192_smear:
    movi v1.16b, #0
    dup v0.4s, v7.s[3]
    ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
    ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
    eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
    eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
    eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
    mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
    ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
    ret


##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##

.align 4
_vpaes_schedule_round:
    // extract rcon from xmm8
    movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
    ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
    ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
    eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7

    // rotate
    dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
    ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0

    // fall through...

    // low round: same as high round, but no rotation and no rcon.
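    // (Entered both by falling through from _vpaes_schedule_round above and
    // by a direct call from the 256-bit schedule in Loop_schedule_256.)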
_vpaes_schedule_low_round:
    // smear xmm7
    ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
    eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
    ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4

    // subbytes
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
    tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
    tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
    eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
    eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
    eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
    tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
    tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
    eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

    // add in smeared stuff
    eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
    eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
    ret


##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##

.align 4
_vpaes_schedule_transform:
    and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
    // vmovdqa (%r11), %xmm2 # lo
    tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
    // vmovdqa 16(%r11), %xmm1 # hi
    tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
    ret


##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
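## (On AArch64 the output pointer is x2 and the Lk_sr index is x8,
## standing in for %rdx and %r8 in the notes below.)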
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##

.align 4
_vpaes_schedule_mangle:
    mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
    // vmovdqa .Lk_mc_forward(%rip),%xmm5
    cbnz w3, Lschedule_mangle_dec

    // encrypting
    eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
    add x2, x2, #16 // add $16, %rdx
    tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
    tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
    tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
    eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
    eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3

    b Lschedule_mangle_both
.align 4
Lschedule_mangle_dec:
    // inverse mix columns
    // lea .Lk_dksd(%rip),%r11
    ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
    and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo

    // vmovdqa 0x00(%r11), %xmm2
    tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    // vmovdqa 0x10(%r11), %xmm3
    tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
    tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

    // vmovdqa 0x20(%r11), %xmm2
    tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
    // vmovdqa 0x30(%r11), %xmm3
    tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
    tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

    // vmovdqa 0x40(%r11), %xmm2
    tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
    // vmovdqa 0x50(%r11), %xmm3
    tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3

    // vmovdqa 0x60(%r11), %xmm2
    tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
    tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
    // vmovdqa 0x70(%r11), %xmm4
    tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
    eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
    eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3

    sub x2, x2, #16 // add $-16, %rdx

Lschedule_mangle_both:
    tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
    add x8, x8, #48 // add $-16, %r8
    and x8, x8, #~(1<<6) // and $0x30, %r8
    st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
    ret

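##
## (vpaes_set_encrypt_key, per the code below: x0 = user key bytes,
## w1 = key length in bits, x2 = AES_KEY-style schedule to fill in;
## returns 0 in x0.  vpaes_set_decrypt_key below mirrors this, but
## sets w3 = 1 and writes the schedule from the end backwards.)
##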
.globl _vpaes_set_encrypt_key
.private_extern _vpaes_set_encrypt_key

.align 4
_vpaes_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so

    lsr w9, w1, #5 // shr $5,%eax
    add w9, w9, #5 // $5,%eax
    str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

    mov w3, #0 // mov $0,%ecx
    mov x8, #0x30 // mov $0x30,%r8d
    bl _vpaes_schedule_core
    eor x0, x0, x0

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


.globl _vpaes_set_decrypt_key
.private_extern _vpaes_set_decrypt_key

.align 4
_vpaes_set_decrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so

    lsr w9, w1, #5 // shr $5,%eax
    add w9, w9, #5 // $5,%eax
    str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
    lsl w9, w9, #4 // shl $4,%eax
    add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
    add x2, x2, x9

    mov w3, #1 // mov $1,%ecx
    lsr w8, w1, #1 // shr $1,%r8d
    and x8, x8, #32 // and $32,%r8d
    eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
    bl _vpaes_schedule_core

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret

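##
## (vpaes_cbc_encrypt, per the code below: x0 = input, x1 = output,
## x2 = length in bytes, x3 = key schedule, x4 = 16-byte IV buffer,
## w5 selects the direction: nonzero encrypts, zero decrypts.)
##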
.globl _vpaes_cbc_encrypt
.private_extern _vpaes_cbc_encrypt

.align 4
_vpaes_cbc_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    cbz x2, Lcbc_abort
    cmp w5, #0 // check direction
    b.eq vpaes_cbc_decrypt

    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    mov x17, x2 // reassign
    mov x2, x3 // reassign

    ld1 {v0.16b}, [x4] // load ivec
    bl _vpaes_encrypt_preheat
    b Lcbc_enc_loop

.align 4
Lcbc_enc_loop:
    ld1 {v7.16b}, [x0],#16 // load input
    eor v7.16b, v7.16b, v0.16b // xor with ivec
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1],#16 // save output
    subs x17, x17, #16
    b.hi Lcbc_enc_loop

    st1 {v0.16b}, [x4] // write ivec

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
Lcbc_abort:
    ret



.align 4
vpaes_cbc_decrypt:
    // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
    // only from vpaes_cbc_encrypt which has already signed the return address.
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    mov x17, x2 // reassign
    mov x2, x3 // reassign
    ld1 {v6.16b}, [x4] // load ivec
    bl _vpaes_decrypt_preheat
    tst x17, #16
    b.eq Lcbc_dec_loop2x

    ld1 {v7.16b}, [x0], #16 // load input
    bl _vpaes_decrypt_core
    eor v0.16b, v0.16b, v6.16b // xor with ivec
    orr v6.16b, v7.16b, v7.16b // next ivec value
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #16
    b.ls Lcbc_dec_done

.align 4
Lcbc_dec_loop2x:
    ld1 {v14.16b,v15.16b}, [x0], #32
    bl _vpaes_decrypt_2x
    eor v0.16b, v0.16b, v6.16b // xor with ivec
    eor v1.16b, v1.16b, v14.16b
    orr v6.16b, v15.16b, v15.16b
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #32
    b.hi Lcbc_dec_loop2x

Lcbc_dec_done:
    st1 {v6.16b}, [x4]

    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.globl _vpaes_ctr32_encrypt_blocks
.private_extern _vpaes_ctr32_encrypt_blocks

.align 4
_vpaes_ctr32_encrypt_blocks:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]! // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    cbz x2, Lctr32_done

    // Note, unlike the other functions, x2 here is measured in blocks,
    // not bytes.
    mov x17, x2
    mov x2, x3

    // Load the IV and counter portion.
    ldr w6, [x4, #12]
    ld1 {v7.16b}, [x4]

    bl _vpaes_encrypt_preheat
    tst x17, #1
    rev w6, w6 // The counter is big-endian.
    b.eq Lctr32_prep_loop

    // Handle one block so the remaining block count is even for
    // _vpaes_encrypt_2x.
    ld1 {v6.16b}, [x0], #16 // Load input ahead of time
    bl _vpaes_encrypt_core
    eor v0.16b, v0.16b, v6.16b // XOR input and result
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #1
    // Update the counter.
    add w6, w6, #1
    rev w7, w6
    mov v7.s[3], w7
    b.ls Lctr32_done

Lctr32_prep_loop:
    // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
    // uses v14 and v15.
    mov v15.16b, v7.16b
    mov v14.16b, v7.16b
    add w6, w6, #1
    rev w7, w6
    mov v15.s[3], w7

Lctr32_loop:
    ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
    bl _vpaes_encrypt_2x
    eor v0.16b, v0.16b, v6.16b // XOR input and result
    eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #2
    // Update the counter.
    add w7, w6, #1
    add w6, w6, #2
    rev w7, w7
    mov v14.s[3], w7
    rev w7, w6
    mov v15.s[3], w7
    b.hi Lctr32_loop

Lctr32_done:
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret

#endif  // !OPENSSL_NO_ASM