1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#if !defined(__has_feature) 5#define __has_feature(x) 0 6#endif 7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 8#define OPENSSL_NO_ASM 9#endif 10 11#if !defined(OPENSSL_NO_ASM) 12#if defined(__aarch64__) 13#if defined(BORINGSSL_PREFIX) 14#include <boringssl_prefix_symbols_asm.h> 15#endif 16#include <openssl/arm_arch.h> 17 18.section .rodata 19 20.type _vpaes_consts,%object 21.align 7 // totally strategic alignment 22_vpaes_consts: 23.Lk_mc_forward: // mc_forward 24.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 25.quad 0x080B0A0904070605, 0x000302010C0F0E0D 26.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 27.quad 0x000302010C0F0E0D, 0x080B0A0904070605 28.Lk_mc_backward: // mc_backward 29.quad 0x0605040702010003, 0x0E0D0C0F0A09080B 30.quad 0x020100030E0D0C0F, 0x0A09080B06050407 31.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 32.quad 0x0A09080B06050407, 0x020100030E0D0C0F 33.Lk_sr: // sr 34.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 35.quad 0x030E09040F0A0500, 0x0B06010C07020D08 36.quad 0x0F060D040B020900, 0x070E050C030A0108 37.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 38 39// 40// "Hot" constants 41// 42.Lk_inv: // inv, inva 43.quad 0x0E05060F0D080180, 0x040703090A0B0C02 44.quad 0x01040A060F0B0780, 0x030D0E0C02050809 45.Lk_ipt: // input transform (lo, hi) 46.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 47.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 48.Lk_sbo: // sbou, sbot 49.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 50.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA 51.Lk_sb1: // sb1u, sb1t 52.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF 53.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 54.Lk_sb2: // sb2u, sb2t 55.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A 56.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD 57 58// 59// Decryption stuff 60// 61.Lk_dipt: // decryption input transform 62.quad 0x0F505B040B545F00, 0x154A411E114E451A 63.quad 0x86E383E660056500, 0x12771772F491F194 64.Lk_dsbo: // decryption sbox final output 65.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D 66.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C 67.Lk_dsb9: // decryption sbox output *9*u, *9*t 68.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 69.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 70.Lk_dsbd: // decryption sbox output *D*u, *D*t 71.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 72.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 73.Lk_dsbb: // decryption sbox output *B*u, *B*t 74.quad 0xD022649296B44200, 0x602646F6B0F2D404 75.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B 76.Lk_dsbe: // decryption sbox output *E*u, *E*t 77.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 78.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 79 80// 81// Key schedule constants 82// 83.Lk_dksd: // decryption key schedule: invskew x*D 84.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 85.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E 86.Lk_dksb: // decryption key schedule: invskew x*B 87.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 88.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 89.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 90.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 91.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 92.Lk_dks9: // decryption key schedule: invskew x*9 93.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC 94.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE 95 96.Lk_rcon: // rcon 97.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 98 99.Lk_opt: // output transform 100.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 101.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 102.Lk_deskew: // deskew tables: inverts the sbox's "skew" 103.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A 104.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 105 106.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 107.align 2 108.size _vpaes_consts,.-_vpaes_consts 109.align 6 110 111.text 112## 113## _aes_preheat 114## 115## Fills register %r10 -> .aes_consts (so you can -fPIC) 116## and %xmm9-%xmm15 as specified below. 117## 118.type _vpaes_encrypt_preheat,%function 119.align 4 120_vpaes_encrypt_preheat: 121 adrp x10, .Lk_inv 122 add x10, x10, :lo12:.Lk_inv 123 movi v17.16b, #0x0f 124 ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv 125 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo 126 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 127 ret 128.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat 129 130## 131## _aes_encrypt_core 132## 133## AES-encrypt %xmm0. 134## 135## Inputs: 136## %xmm0 = input 137## %xmm9-%xmm15 as in _vpaes_preheat 138## (%rdx) = scheduled keys 139## 140## Output in %xmm0 141## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax 142## Preserves %xmm6 - %xmm8 so you get some local vectors 143## 144## 145.type _vpaes_encrypt_core,%function 146.align 4 147_vpaes_encrypt_core: 148 mov x9, x2 149 ldr w8, [x2,#240] // pull rounds 150 adrp x11, .Lk_mc_forward+16 151 add x11, x11, :lo12:.Lk_mc_forward+16 152 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 153 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 154 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 155 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 156 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 157 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 158 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 159 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 160 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 161 b .Lenc_entry 162 163.align 4 164.Lenc_loop: 165 // middle of middle round 166 add x10, x11, #0x40 167 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 168 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 169 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 170 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 171 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 172 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 173 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 174 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 175 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 176 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 177 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 178 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 179 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 180 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 181 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 182 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 183 sub w8, w8, #1 // nr-- 184 185.Lenc_entry: 186 // top of round 187 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 188 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 189 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 190 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 191 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 192 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 193 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 194 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 195 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 196 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 197 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 198 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 199 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 200 cbnz w8, .Lenc_loop 201 202 // middle of last round 203 add x10, x11, #0x80 204 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 205 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 206 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 207 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 208 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 209 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 210 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 211 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 212 ret 213.size _vpaes_encrypt_core,.-_vpaes_encrypt_core 214 215.globl vpaes_encrypt 216.hidden vpaes_encrypt 217.type vpaes_encrypt,%function 218.align 4 219vpaes_encrypt: 220 AARCH64_SIGN_LINK_REGISTER 221 stp x29,x30,[sp,#-16]! 222 add x29,sp,#0 223 224 ld1 {v7.16b}, [x0] 225 bl _vpaes_encrypt_preheat 226 bl _vpaes_encrypt_core 227 st1 {v0.16b}, [x1] 228 229 ldp x29,x30,[sp],#16 230 AARCH64_VALIDATE_LINK_REGISTER 231 ret 232.size vpaes_encrypt,.-vpaes_encrypt 233 234.type _vpaes_encrypt_2x,%function 235.align 4 236_vpaes_encrypt_2x: 237 mov x9, x2 238 ldr w8, [x2,#240] // pull rounds 239 adrp x11, .Lk_mc_forward+16 240 add x11, x11, :lo12:.Lk_mc_forward+16 241 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 242 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 243 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 244 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 245 and v9.16b, v15.16b, v17.16b 246 ushr v8.16b, v15.16b, #4 247 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 248 tbl v9.16b, {v20.16b}, v9.16b 249 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 250 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 251 tbl v10.16b, {v21.16b}, v8.16b 252 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 253 eor v8.16b, v9.16b, v16.16b 254 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 255 eor v8.16b, v8.16b, v10.16b 256 b .Lenc_2x_entry 257 258.align 4 259.Lenc_2x_loop: 260 // middle of middle round 261 add x10, x11, #0x40 262 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 263 tbl v12.16b, {v25.16b}, v10.16b 264 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 265 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 266 tbl v8.16b, {v24.16b}, v11.16b 267 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 268 eor v12.16b, v12.16b, v16.16b 269 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 270 tbl v13.16b, {v27.16b}, v10.16b 271 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 272 eor v8.16b, v8.16b, v12.16b 273 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 274 tbl v10.16b, {v26.16b}, v11.16b 275 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 276 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 277 tbl v11.16b, {v8.16b}, v1.16b 278 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 279 eor v10.16b, v10.16b, v13.16b 280 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 281 tbl v8.16b, {v8.16b}, v4.16b 282 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 283 eor v11.16b, v11.16b, v10.16b 284 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 285 tbl v12.16b, {v11.16b},v1.16b 286 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 287 eor v8.16b, v8.16b, v11.16b 288 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 289 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 290 eor v8.16b, v8.16b, v12.16b 291 sub w8, w8, #1 // nr-- 292 293.Lenc_2x_entry: 294 // top of round 295 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 296 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 297 and v9.16b, v8.16b, v17.16b 298 ushr v8.16b, v8.16b, #4 299 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 300 tbl v13.16b, {v19.16b},v9.16b 301 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 302 eor v9.16b, v9.16b, v8.16b 303 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 304 tbl v11.16b, {v18.16b},v8.16b 305 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 306 tbl v12.16b, {v18.16b},v9.16b 307 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 308 eor v11.16b, v11.16b, v13.16b 309 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 310 eor v12.16b, v12.16b, v13.16b 311 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 312 tbl v10.16b, {v18.16b},v11.16b 313 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 314 tbl v11.16b, {v18.16b},v12.16b 315 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 316 eor v10.16b, v10.16b, v9.16b 317 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 318 eor v11.16b, v11.16b, v8.16b 319 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 320 cbnz w8, .Lenc_2x_loop 321 322 // middle of last round 323 add x10, x11, #0x80 324 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 325 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 326 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 327 tbl v12.16b, {v22.16b}, v10.16b 328 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 329 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 330 tbl v8.16b, {v23.16b}, v11.16b 331 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 332 eor v12.16b, v12.16b, v16.16b 333 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 334 eor v8.16b, v8.16b, v12.16b 335 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 336 tbl v1.16b, {v8.16b},v1.16b 337 ret 338.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x 339 340.type _vpaes_decrypt_preheat,%function 341.align 4 342_vpaes_decrypt_preheat: 343 adrp x10, .Lk_inv 344 add x10, x10, :lo12:.Lk_inv 345 movi v17.16b, #0x0f 346 adrp x11, .Lk_dipt 347 add x11, x11, :lo12:.Lk_dipt 348 ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv 349 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo 350 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd 351 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe 352 ret 353.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat 354 355## 356## Decryption core 357## 358## Same API as encryption core. 359## 360.type _vpaes_decrypt_core,%function 361.align 4 362_vpaes_decrypt_core: 363 mov x9, x2 364 ldr w8, [x2,#240] // pull rounds 365 366 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 367 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 368 eor x11, x11, #0x30 // xor $0x30, %r11 369 adrp x10, .Lk_sr 370 add x10, x10, :lo12:.Lk_sr 371 and x11, x11, #0x30 // and $0x30, %r11 372 add x11, x11, x10 373 adrp x10, .Lk_mc_forward+48 374 add x10, x10, :lo12:.Lk_mc_forward+48 375 376 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 377 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 378 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 379 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 380 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 381 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 382 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 383 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 384 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 385 b .Ldec_entry 386 387.align 4 388.Ldec_loop: 389// 390// Inverse mix columns 391// 392 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 393 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 394 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 395 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 396 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 397 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 398 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 399 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 400 401 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 402 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 403 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 404 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 405 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 406 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 407 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 408 409 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 410 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 411 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 412 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 413 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 414 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 415 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 416 417 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 418 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 419 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 420 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 421 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 422 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 423 sub w8, w8, #1 // sub $1,%rax # nr-- 424 425.Ldec_entry: 426 // top of round 427 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 428 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 429 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 430 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 431 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 432 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 433 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 434 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 435 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 436 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 437 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 438 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 439 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 440 cbnz w8, .Ldec_loop 441 442 // middle of last round 443 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 444 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 445 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 446 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 447 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 448 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 449 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 450 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 451 ret 452.size _vpaes_decrypt_core,.-_vpaes_decrypt_core 453 454.globl vpaes_decrypt 455.hidden vpaes_decrypt 456.type vpaes_decrypt,%function 457.align 4 458vpaes_decrypt: 459 AARCH64_SIGN_LINK_REGISTER 460 stp x29,x30,[sp,#-16]! 461 add x29,sp,#0 462 463 ld1 {v7.16b}, [x0] 464 bl _vpaes_decrypt_preheat 465 bl _vpaes_decrypt_core 466 st1 {v0.16b}, [x1] 467 468 ldp x29,x30,[sp],#16 469 AARCH64_VALIDATE_LINK_REGISTER 470 ret 471.size vpaes_decrypt,.-vpaes_decrypt 472 473// v14-v15 input, v0-v1 output 474.type _vpaes_decrypt_2x,%function 475.align 4 476_vpaes_decrypt_2x: 477 mov x9, x2 478 ldr w8, [x2,#240] // pull rounds 479 480 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 481 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 482 eor x11, x11, #0x30 // xor $0x30, %r11 483 adrp x10, .Lk_sr 484 add x10, x10, :lo12:.Lk_sr 485 and x11, x11, #0x30 // and $0x30, %r11 486 add x11, x11, x10 487 adrp x10, .Lk_mc_forward+48 488 add x10, x10, :lo12:.Lk_mc_forward+48 489 490 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 491 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 492 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 493 and v9.16b, v15.16b, v17.16b 494 ushr v8.16b, v15.16b, #4 495 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 496 tbl v10.16b, {v20.16b},v9.16b 497 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 498 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 499 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 500 tbl v8.16b, {v21.16b},v8.16b 501 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 502 eor v10.16b, v10.16b, v16.16b 503 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 504 eor v8.16b, v8.16b, v10.16b 505 b .Ldec_2x_entry 506 507.align 4 508.Ldec_2x_loop: 509// 510// Inverse mix columns 511// 512 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 513 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 514 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 515 tbl v12.16b, {v24.16b}, v10.16b 516 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 517 tbl v9.16b, {v25.16b}, v11.16b 518 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 519 eor v8.16b, v12.16b, v16.16b 520 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 521 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 522 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 523 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 524 525 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 526 tbl v12.16b, {v26.16b}, v10.16b 527 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 528 tbl v8.16b, {v8.16b},v5.16b 529 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 530 tbl v9.16b, {v27.16b}, v11.16b 531 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 532 eor v8.16b, v8.16b, v12.16b 533 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 534 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 535 eor v8.16b, v8.16b, v9.16b 536 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 537 538 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 539 tbl v12.16b, {v28.16b}, v10.16b 540 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 541 tbl v8.16b, {v8.16b},v5.16b 542 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 543 tbl v9.16b, {v29.16b}, v11.16b 544 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 545 eor v8.16b, v8.16b, v12.16b 546 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 547 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 548 eor v8.16b, v8.16b, v9.16b 549 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 550 551 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 552 tbl v12.16b, {v30.16b}, v10.16b 553 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 554 tbl v8.16b, {v8.16b},v5.16b 555 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 556 tbl v9.16b, {v31.16b}, v11.16b 557 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 558 eor v8.16b, v8.16b, v12.16b 559 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 560 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 561 eor v8.16b, v8.16b, v9.16b 562 sub w8, w8, #1 // sub $1,%rax # nr-- 563 564.Ldec_2x_entry: 565 // top of round 566 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 567 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 568 and v9.16b, v8.16b, v17.16b 569 ushr v8.16b, v8.16b, #4 570 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 571 tbl v10.16b, {v19.16b},v9.16b 572 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 573 eor v9.16b, v9.16b, v8.16b 574 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 575 tbl v11.16b, {v18.16b},v8.16b 576 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 577 tbl v12.16b, {v18.16b},v9.16b 578 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 579 eor v11.16b, v11.16b, v10.16b 580 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 581 eor v12.16b, v12.16b, v10.16b 582 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 583 tbl v10.16b, {v18.16b},v11.16b 584 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 585 tbl v11.16b, {v18.16b},v12.16b 586 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 587 eor v10.16b, v10.16b, v9.16b 588 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 589 eor v11.16b, v11.16b, v8.16b 590 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 591 cbnz w8, .Ldec_2x_loop 592 593 // middle of last round 594 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 595 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 596 tbl v12.16b, {v22.16b}, v10.16b 597 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 598 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 599 tbl v9.16b, {v23.16b}, v11.16b 600 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 601 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 602 eor v12.16b, v12.16b, v16.16b 603 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 604 eor v8.16b, v9.16b, v12.16b 605 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 606 tbl v1.16b, {v8.16b},v2.16b 607 ret 608.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x 609######################################################## 610## ## 611## AES key schedule ## 612## ## 613######################################################## 614.type _vpaes_key_preheat,%function 615.align 4 616_vpaes_key_preheat: 617 adrp x10, .Lk_inv 618 add x10, x10, :lo12:.Lk_inv 619 movi v16.16b, #0x5b // .Lk_s63 620 adrp x11, .Lk_sb1 621 add x11, x11, :lo12:.Lk_sb1 622 movi v17.16b, #0x0f // .Lk_s0F 623 ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt 624 adrp x10, .Lk_dksd 625 add x10, x10, :lo12:.Lk_dksd 626 ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 627 adrp x11, .Lk_mc_forward 628 add x11, x11, :lo12:.Lk_mc_forward 629 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb 630 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 631 ld1 {v8.2d}, [x10] // .Lk_rcon 632 ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] 633 ret 634.size _vpaes_key_preheat,.-_vpaes_key_preheat 635 636.type _vpaes_schedule_core,%function 637.align 4 638_vpaes_schedule_core: 639 AARCH64_SIGN_LINK_REGISTER 640 stp x29, x30, [sp,#-16]! 641 add x29,sp,#0 642 643 bl _vpaes_key_preheat // load the tables 644 645 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) 646 647 // input transform 648 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 649 bl _vpaes_schedule_transform 650 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 651 652 adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 653 add x10, x10, :lo12:.Lk_sr 654 655 add x8, x8, x10 656 cbnz w3, .Lschedule_am_decrypting 657 658 // encrypting, output zeroth round key after transform 659 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) 660 b .Lschedule_go 661 662.Lschedule_am_decrypting: 663 // decrypting, output zeroth round key after shiftrows 664 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 665 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 666 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 667 eor x8, x8, #0x30 // xor $0x30, %r8 668 669.Lschedule_go: 670 cmp w1, #192 // cmp $192, %esi 671 b.hi .Lschedule_256 672 b.eq .Lschedule_192 673 // 128: fall though 674 675## 676## .schedule_128 677## 678## 128-bit specific part of key schedule. 679## 680## This schedule is really simple, because all its parts 681## are accomplished by the subroutines. 682## 683.Lschedule_128: 684 mov x0, #10 // mov $10, %esi 685 686.Loop_schedule_128: 687 sub x0, x0, #1 // dec %esi 688 bl _vpaes_schedule_round 689 cbz x0, .Lschedule_mangle_last 690 bl _vpaes_schedule_mangle // write output 691 b .Loop_schedule_128 692 693## 694## .aes_schedule_192 695## 696## 192-bit specific part of key schedule. 697## 698## The main body of this schedule is the same as the 128-bit 699## schedule, but with more smearing. The long, high side is 700## stored in %xmm7 as before, and the short, low side is in 701## the high bits of %xmm6. 702## 703## This schedule is somewhat nastier, however, because each 704## round produces 192 bits of key material, or 1.5 round keys. 705## Therefore, on each cycle we do 2 rounds and produce 3 round 706## keys. 707## 708.align 4 709.Lschedule_192: 710 sub x0, x0, #8 711 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 712 bl _vpaes_schedule_transform // input transform 713 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part 714 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 715 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros 716 mov x0, #4 // mov $4, %esi 717 718.Loop_schedule_192: 719 sub x0, x0, #1 // dec %esi 720 bl _vpaes_schedule_round 721 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 722 bl _vpaes_schedule_mangle // save key n 723 bl _vpaes_schedule_192_smear 724 bl _vpaes_schedule_mangle // save key n+1 725 bl _vpaes_schedule_round 726 cbz x0, .Lschedule_mangle_last 727 bl _vpaes_schedule_mangle // save key n+2 728 bl _vpaes_schedule_192_smear 729 b .Loop_schedule_192 730 731## 732## .aes_schedule_256 733## 734## 256-bit specific part of key schedule. 735## 736## The structure here is very similar to the 128-bit 737## schedule, but with an additional "low side" in 738## %xmm6. The low side's rounds are the same as the 739## high side's, except no rcon and no rotation. 740## 741.align 4 742.Lschedule_256: 743 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 744 bl _vpaes_schedule_transform // input transform 745 mov x0, #7 // mov $7, %esi 746 747.Loop_schedule_256: 748 sub x0, x0, #1 // dec %esi 749 bl _vpaes_schedule_mangle // output low result 750 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 751 752 // high round 753 bl _vpaes_schedule_round 754 cbz x0, .Lschedule_mangle_last 755 bl _vpaes_schedule_mangle 756 757 // low round. swap xmm7 and xmm6 758 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 759 movi v4.16b, #0 760 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 761 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 762 bl _vpaes_schedule_low_round 763 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 764 765 b .Loop_schedule_256 766 767## 768## .aes_schedule_mangle_last 769## 770## Mangler for last round of key schedule 771## Mangles %xmm0 772## when encrypting, outputs out(%xmm0) ^ 63 773## when decrypting, outputs unskew(%xmm0) 774## 775## Always called right before return... jumps to cleanup and exits 776## 777.align 4 778.Lschedule_mangle_last: 779 // schedule last round key from xmm0 780 adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew 781 add x11, x11, :lo12:.Lk_deskew 782 783 cbnz w3, .Lschedule_mangle_last_dec 784 785 // encrypting 786 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 787 adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform 788 add x11, x11, :lo12:.Lk_opt 789 add x2, x2, #32 // add $32, %rdx 790 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute 791 792.Lschedule_mangle_last_dec: 793 ld1 {v20.2d,v21.2d}, [x11] // reload constants 794 sub x2, x2, #16 // add $-16, %rdx 795 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 796 bl _vpaes_schedule_transform // output transform 797 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key 798 799 // cleanup 800 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 801 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 802 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 803 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 804 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 805 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 806 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 807 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 808 ldp x29, x30, [sp],#16 809 AARCH64_VALIDATE_LINK_REGISTER 810 ret 811.size _vpaes_schedule_core,.-_vpaes_schedule_core 812 813## 814## .aes_schedule_192_smear 815## 816## Smear the short, low side in the 192-bit key schedule. 817## 818## Inputs: 819## %xmm7: high side, b a x y 820## %xmm6: low side, d c 0 0 821## %xmm13: 0 822## 823## Outputs: 824## %xmm6: b+c+d b+c 0 0 825## %xmm0: b+c+d b+c b a 826## 827.type _vpaes_schedule_192_smear,%function 828.align 4 829_vpaes_schedule_192_smear: 830 movi v1.16b, #0 831 dup v0.4s, v7.s[3] 832 ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 833 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 834 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 835 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 836 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 837 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 838 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros 839 ret 840.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear 841 842## 843## .aes_schedule_round 844## 845## Runs one main round of the key schedule on %xmm0, %xmm7 846## 847## Specifically, runs subbytes on the high dword of %xmm0 848## then rotates it by one byte and xors into the low dword of 849## %xmm7. 850## 851## Adds rcon from low byte of %xmm8, then rotates %xmm8 for 852## next rcon. 853## 854## Smears the dwords of %xmm7 by xoring the low into the 855## second low, result into third, result into highest. 856## 857## Returns results in %xmm7 = %xmm0. 858## Clobbers %xmm1-%xmm4, %r11. 859## 860.type _vpaes_schedule_round,%function 861.align 4 862_vpaes_schedule_round: 863 // extract rcon from xmm8 864 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 865 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 866 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 867 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 868 869 // rotate 870 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 871 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 872 873 // fall through... 874 875 // low round: same as high round, but no rotation and no rcon. 876_vpaes_schedule_low_round: 877 // smear xmm7 878 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 879 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 880 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 881 882 // subbytes 883 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 884 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 885 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 886 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 887 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 888 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 889 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 890 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 891 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 892 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 893 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 894 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 895 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io 896 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 897 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 898 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 899 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 900 901 // add in smeared stuff 902 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 903 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 904 ret 905.size _vpaes_schedule_round,.-_vpaes_schedule_round 906 907## 908## .aes_schedule_transform 909## 910## Linear-transform %xmm0 according to tables at (%r11) 911## 912## Requires that %xmm9 = 0x0F0F... as in preheat 913## Output in %xmm0 914## Clobbers %xmm1, %xmm2 915## 916.type _vpaes_schedule_transform,%function 917.align 4 918_vpaes_schedule_transform: 919 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 920 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 921 // vmovdqa (%r11), %xmm2 # lo 922 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 923 // vmovdqa 16(%r11), %xmm1 # hi 924 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 925 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 926 ret 927.size _vpaes_schedule_transform,.-_vpaes_schedule_transform 928 929## 930## .aes_schedule_mangle 931## 932## Mangle xmm0 from (basis-transformed) standard version 933## to our version. 934## 935## On encrypt, 936## xor with 0x63 937## multiply by circulant 0,1,1,1 938## apply shiftrows transform 939## 940## On decrypt, 941## xor with 0x63 942## multiply by "inverse mixcolumns" circulant E,B,D,9 943## deskew 944## apply shiftrows transform 945## 946## 947## Writes out to (%rdx), and increments or decrements it 948## Keeps track of round number mod 4 in %r8 949## Preserves xmm0 950## Clobbers xmm1-xmm5 951## 952.type _vpaes_schedule_mangle,%function 953.align 4 954_vpaes_schedule_mangle: 955 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later 956 // vmovdqa .Lk_mc_forward(%rip),%xmm5 957 cbnz w3, .Lschedule_mangle_dec 958 959 // encrypting 960 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 961 add x2, x2, #16 // add $16, %rdx 962 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 963 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 964 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 965 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 966 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 967 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 968 969 b .Lschedule_mangle_both 970.align 4 971.Lschedule_mangle_dec: 972 // inverse mix columns 973 // lea .Lk_dksd(%rip),%r11 974 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi 975 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo 976 977 // vmovdqa 0x00(%r11), %xmm2 978 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 979 // vmovdqa 0x10(%r11), %xmm3 980 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 981 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 982 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 983 984 // vmovdqa 0x20(%r11), %xmm2 985 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 986 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 987 // vmovdqa 0x30(%r11), %xmm3 988 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 989 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 990 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 991 992 // vmovdqa 0x40(%r11), %xmm2 993 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 994 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 995 // vmovdqa 0x50(%r11), %xmm3 996 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 997 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 998 999 // vmovdqa 0x60(%r11), %xmm2 1000 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 1001 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 1002 // vmovdqa 0x70(%r11), %xmm4 1003 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 1004 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 1005 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 1006 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 1007 1008 sub x2, x2, #16 // add $-16, %rdx 1009 1010.Lschedule_mangle_both: 1011 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1012 add x8, x8, #48 // add $-16, %r8 1013 and x8, x8, #~(1<<6) // and $0x30, %r8 1014 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 1015 ret 1016.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle 1017 1018.globl vpaes_set_encrypt_key 1019.hidden vpaes_set_encrypt_key 1020.type vpaes_set_encrypt_key,%function 1021.align 4 1022vpaes_set_encrypt_key: 1023 AARCH64_SIGN_LINK_REGISTER 1024 stp x29,x30,[sp,#-16]! 1025 add x29,sp,#0 1026 stp d8,d9,[sp,#-16]! // ABI spec says so 1027 1028 lsr w9, w1, #5 // shr $5,%eax 1029 add w9, w9, #5 // $5,%eax 1030 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1031 1032 mov w3, #0 // mov $0,%ecx 1033 mov x8, #0x30 // mov $0x30,%r8d 1034 bl _vpaes_schedule_core 1035 eor x0, x0, x0 1036 1037 ldp d8,d9,[sp],#16 1038 ldp x29,x30,[sp],#16 1039 AARCH64_VALIDATE_LINK_REGISTER 1040 ret 1041.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key 1042 1043.globl vpaes_set_decrypt_key 1044.hidden vpaes_set_decrypt_key 1045.type vpaes_set_decrypt_key,%function 1046.align 4 1047vpaes_set_decrypt_key: 1048 AARCH64_SIGN_LINK_REGISTER 1049 stp x29,x30,[sp,#-16]! 1050 add x29,sp,#0 1051 stp d8,d9,[sp,#-16]! // ABI spec says so 1052 1053 lsr w9, w1, #5 // shr $5,%eax 1054 add w9, w9, #5 // $5,%eax 1055 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1056 lsl w9, w9, #4 // shl $4,%eax 1057 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx 1058 add x2, x2, x9 1059 1060 mov w3, #1 // mov $1,%ecx 1061 lsr w8, w1, #1 // shr $1,%r8d 1062 and x8, x8, #32 // and $32,%r8d 1063 eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 1064 bl _vpaes_schedule_core 1065 1066 ldp d8,d9,[sp],#16 1067 ldp x29,x30,[sp],#16 1068 AARCH64_VALIDATE_LINK_REGISTER 1069 ret 1070.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key 1071.globl vpaes_cbc_encrypt 1072.hidden vpaes_cbc_encrypt 1073.type vpaes_cbc_encrypt,%function 1074.align 4 1075vpaes_cbc_encrypt: 1076 AARCH64_SIGN_LINK_REGISTER 1077 cbz x2, .Lcbc_abort 1078 cmp w5, #0 // check direction 1079 b.eq vpaes_cbc_decrypt 1080 1081 stp x29,x30,[sp,#-16]! 1082 add x29,sp,#0 1083 1084 mov x17, x2 // reassign 1085 mov x2, x3 // reassign 1086 1087 ld1 {v0.16b}, [x4] // load ivec 1088 bl _vpaes_encrypt_preheat 1089 b .Lcbc_enc_loop 1090 1091.align 4 1092.Lcbc_enc_loop: 1093 ld1 {v7.16b}, [x0],#16 // load input 1094 eor v7.16b, v7.16b, v0.16b // xor with ivec 1095 bl _vpaes_encrypt_core 1096 st1 {v0.16b}, [x1],#16 // save output 1097 subs x17, x17, #16 1098 b.hi .Lcbc_enc_loop 1099 1100 st1 {v0.16b}, [x4] // write ivec 1101 1102 ldp x29,x30,[sp],#16 1103 AARCH64_VALIDATE_LINK_REGISTER 1104.Lcbc_abort: 1105 ret 1106.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt 1107 1108.type vpaes_cbc_decrypt,%function 1109.align 4 1110vpaes_cbc_decrypt: 1111 // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to 1112 // only from vpaes_cbc_encrypt which has already signed the return address. 1113 stp x29,x30,[sp,#-16]! 1114 add x29,sp,#0 1115 stp d8,d9,[sp,#-16]! // ABI spec says so 1116 stp d10,d11,[sp,#-16]! 1117 stp d12,d13,[sp,#-16]! 1118 stp d14,d15,[sp,#-16]! 1119 1120 mov x17, x2 // reassign 1121 mov x2, x3 // reassign 1122 ld1 {v6.16b}, [x4] // load ivec 1123 bl _vpaes_decrypt_preheat 1124 tst x17, #16 1125 b.eq .Lcbc_dec_loop2x 1126 1127 ld1 {v7.16b}, [x0], #16 // load input 1128 bl _vpaes_decrypt_core 1129 eor v0.16b, v0.16b, v6.16b // xor with ivec 1130 orr v6.16b, v7.16b, v7.16b // next ivec value 1131 st1 {v0.16b}, [x1], #16 1132 subs x17, x17, #16 1133 b.ls .Lcbc_dec_done 1134 1135.align 4 1136.Lcbc_dec_loop2x: 1137 ld1 {v14.16b,v15.16b}, [x0], #32 1138 bl _vpaes_decrypt_2x 1139 eor v0.16b, v0.16b, v6.16b // xor with ivec 1140 eor v1.16b, v1.16b, v14.16b 1141 orr v6.16b, v15.16b, v15.16b 1142 st1 {v0.16b,v1.16b}, [x1], #32 1143 subs x17, x17, #32 1144 b.hi .Lcbc_dec_loop2x 1145 1146.Lcbc_dec_done: 1147 st1 {v6.16b}, [x4] 1148 1149 ldp d14,d15,[sp],#16 1150 ldp d12,d13,[sp],#16 1151 ldp d10,d11,[sp],#16 1152 ldp d8,d9,[sp],#16 1153 ldp x29,x30,[sp],#16 1154 AARCH64_VALIDATE_LINK_REGISTER 1155 ret 1156.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt 1157.globl vpaes_ctr32_encrypt_blocks 1158.hidden vpaes_ctr32_encrypt_blocks 1159.type vpaes_ctr32_encrypt_blocks,%function 1160.align 4 1161vpaes_ctr32_encrypt_blocks: 1162 AARCH64_SIGN_LINK_REGISTER 1163 stp x29,x30,[sp,#-16]! 1164 add x29,sp,#0 1165 stp d8,d9,[sp,#-16]! // ABI spec says so 1166 stp d10,d11,[sp,#-16]! 1167 stp d12,d13,[sp,#-16]! 1168 stp d14,d15,[sp,#-16]! 1169 1170 cbz x2, .Lctr32_done 1171 1172 // Note, unlike the other functions, x2 here is measured in blocks, 1173 // not bytes. 1174 mov x17, x2 1175 mov x2, x3 1176 1177 // Load the IV and counter portion. 1178 ldr w6, [x4, #12] 1179 ld1 {v7.16b}, [x4] 1180 1181 bl _vpaes_encrypt_preheat 1182 tst x17, #1 1183 rev w6, w6 // The counter is big-endian. 1184 b.eq .Lctr32_prep_loop 1185 1186 // Handle one block so the remaining block count is even for 1187 // _vpaes_encrypt_2x. 1188 ld1 {v6.16b}, [x0], #16 // .Load input ahead of time 1189 bl _vpaes_encrypt_core 1190 eor v0.16b, v0.16b, v6.16b // XOR input and result 1191 st1 {v0.16b}, [x1], #16 1192 subs x17, x17, #1 1193 // Update the counter. 1194 add w6, w6, #1 1195 rev w7, w6 1196 mov v7.s[3], w7 1197 b.ls .Lctr32_done 1198 1199.Lctr32_prep_loop: 1200 // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x 1201 // uses v14 and v15. 1202 mov v15.16b, v7.16b 1203 mov v14.16b, v7.16b 1204 add w6, w6, #1 1205 rev w7, w6 1206 mov v15.s[3], w7 1207 1208.Lctr32_loop: 1209 ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time 1210 bl _vpaes_encrypt_2x 1211 eor v0.16b, v0.16b, v6.16b // XOR input and result 1212 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) 1213 st1 {v0.16b,v1.16b}, [x1], #32 1214 subs x17, x17, #2 1215 // Update the counter. 1216 add w7, w6, #1 1217 add w6, w6, #2 1218 rev w7, w7 1219 mov v14.s[3], w7 1220 rev w7, w6 1221 mov v15.s[3], w7 1222 b.hi .Lctr32_loop 1223 1224.Lctr32_done: 1225 ldp d14,d15,[sp],#16 1226 ldp d12,d13,[sp],#16 1227 ldp d10,d11,[sp],#16 1228 ldp d8,d9,[sp],#16 1229 ldp x29,x30,[sp],#16 1230 AARCH64_VALIDATE_LINK_REGISTER 1231 ret 1232.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks 1233#endif 1234#endif // !OPENSSL_NO_ASM 1235.section .note.GNU-stack,"",%progbits 1236