// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.section .rodata

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:	// sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv:	// inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Decryption stuff
//
.Lk_dipt:	// decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2
.size	_vpaes_consts,.-_vpaes_consts
.align	6

.text
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]	// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]	// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, :lo12:.Lk_mc_forward+16
	// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16	// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Lenc_entry

.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16	// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]	// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)	// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1	// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16	// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
	// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]	// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl	vpaes_encrypt
.hidden	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_encrypt,.-vpaes_encrypt
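
// For reference: vpaes_encrypt above reads one 16-byte block from x0, encrypts it
// with the key schedule pointed to by x2 (round count at byte offset 240, as set up
// by vpaes_set_encrypt_key), and writes 16 bytes to x1. A minimal caller-side sketch
// in C, assuming the usual BoringSSL-style prototype and AES_KEY layout (an
// assumption about the C headers, not something declared in this file):
//
//   void vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
//
//   uint8_t out_block[16];
//   vpaes_encrypt(in_block, out_block, &aes_key);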

.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]	// pull rounds
	adrp	x11, .Lk_mc_forward+16
	add	x11, x11, :lo12:.Lk_mc_forward+16
	// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16	// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v14.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b, {v20.16b}, v9.16b
	// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b, v1.16b, v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry

.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16	// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v24.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]	// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)	// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1	// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16	// vmovdqu	(%r9),	%xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
	// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]	// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b, {v23.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v17.16b, #0x0f
	adrp	x11, .Lk_dipt
	add	x11, x11, :lo12:.Lk_dipt
	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]	// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

##
## Decryption core
##
## Same API as encryption core.
##
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]	// pull rounds

	// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4	// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30	// xor	$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, :lo12:.Lk_sr
	and	x11, x11, #0x30	// and	$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, :lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16	// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]	// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
	// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	b	.Ldec_entry

.align	4
.Ldec_loop:
//
//  Inverse mix columns
//
	// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
	// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
	// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	// vmovdqa	0x20(%r10), %xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	// vmovdqa	0x30(%r10), %xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	// vmovdqa	0x40(%r10), %xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	// vmovdqa	0x50(%r10), %xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1	// sub	$1,%rax			# nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b	// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16	// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
	// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]	// vmovdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.hidden	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]	// pull rounds

	// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4	// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30	// xor	$0x30,	%r11
	adrp	x10, .Lk_sr
	add	x10, x10, :lo12:.Lk_sr
	and	x11, x11, #0x30	// and	$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, .Lk_mc_forward+48
	add	x10, x10, :lo12:.Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16	// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v14.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]	// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
	// vmovdqa	.Lk_dipt+16(%rip), %xmm1	# ipthi
	tbl	v0.16b, {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	tbl	v8.16b, {v21.16b},v8.16b
	eor	v2.16b, v2.16b, v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry

.align	4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
	// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
	// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b, {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	tbl	v9.16b, {v25.16b}, v11.16b
	eor	v0.16b, v4.16b, v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
	eor	v8.16b, v12.16b, v16.16b
	// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	tbl	v9.16b, {v27.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa	0x20(%r10), %xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa	0x30(%r10), %xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	tbl	v9.16b, {v29.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa	0x40(%r10), %xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa	0x50(%r10), %xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	tbl	v9.16b, {v31.16b}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1	// sub	$1,%rax			# nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b	// vpxor	%xmm2,	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16	// vmovdqu	(%r9),	%xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
	// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	tbl	v1.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	tbl	v9.16b, {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]	// vmovdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adrp	x10, .Lk_inv
	add	x10, x10, :lo12:.Lk_inv
	movi	v16.16b, #0x5b	// .Lk_s63
	adrp	x11, .Lk_sb1
	add	x11, x11, :lo12:.Lk_sb1
	movi	v17.16b, #0x0f	// .Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]	// .Lk_inv, .Lk_ipt
	adrp	x10, .Lk_dksd
	add	x10, x10, :lo12:.Lk_dksd
	ld1	{v22.2d,v23.2d}, [x11]	// .Lk_sb1
	adrp	x11, .Lk_mc_forward
	add	x11, x11, :lo12:.Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]	// .Lk_rcon
	ld1	{v9.2d}, [x11]	// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat	// load the tables

	ld1	{v0.16b}, [x0],#16	// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b	// vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b	// vmovdqa	%xmm0,	%xmm7

	adrp	x10, .Lk_sr	// lea	.Lk_sr(%rip),%r10
	add	x10, x10, :lo12:.Lk_sr

	add	x8, x8, x10
	cbnz	w3, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]	// vmovdqu	%xmm0,	(%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]	// vmovdqa	(%r8,%r10),	%xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	st1	{v3.2d}, [x2]	// vmovdqu	%xmm3,	(%rdx)
	eor	x8, x8, #0x30	// xor	$0x30, %r8

.Lschedule_go:
	cmp	w1, #192	// cmp	$192,	%esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
.Lschedule_128:
	mov	x0, #10	// mov	$10, %esi

.Loop_schedule_128:
	sub	x0, x0, #1	// dec	%esi
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// write output
	b	.Loop_schedule_128

##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
.align	4
.Lschedule_192:
	sub	x0, x0, #8
	ld1	{v0.16b}, [x0]	// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b	// vmovdqa	%xmm0,	%xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b	// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]	// vmovhlps	%xmm4,	%xmm6,	%xmm6	# clobber low side with zeros
	mov	x0, #4	// mov	$4,	%esi

.Loop_schedule_192:
	sub	x0, x0, #1	// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [x0]	// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7	// mov	$7, %esi

.Loop_schedule_256:
	sub	x0, x0, #1	// dec	%esi
	bl	_vpaes_schedule_mangle	// output low result
	mov	v6.16b, v0.16b	// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	x0, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]	// vpshufd	$0xFF,	%xmm0,	%xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b	// vmovdqa	%xmm7,	%xmm5
	mov	v7.16b, v6.16b	// vmovdqa	%xmm6,	%xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b	// vmovdqa	%xmm5,	%xmm7

	b	.Loop_schedule_256

##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adrp	x11, .Lk_deskew	// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	add	x11, x11, :lo12:.Lk_deskew

	cbnz	w3, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]	// vmovdqa	(%r8,%r10),%xmm1
	adrp	x11, .Lk_opt	// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
	add	x11, x11, :lo12:.Lk_opt
	add	x2, x2, #32	// add	$32,	%rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0	# output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]	// reload constants
	sub	x2, x2, #16	// add	$-16,	%rdx
	eor	v0.16b, v0.16b, v16.16b	// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]	// vmovdqu	%xmm0,	(%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b	// vpxor	%xmm0,	%xmm0,	%xmm0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v2.16b, v2.16b, v2.16b	// vpxor	%xmm2,	%xmm2,	%xmm2
	eor	v3.16b, v3.16b, v3.16b	// vpxor	%xmm3,	%xmm3,	%xmm3
	eor	v4.16b, v4.16b, v4.16b	// vpxor	%xmm4,	%xmm4,	%xmm4
	eor	v5.16b, v5.16b, v5.16b	// vpxor	%xmm5,	%xmm5,	%xmm5
	eor	v6.16b, v6.16b, v6.16b	// vpxor	%xmm6,	%xmm6,	%xmm6
	eor	v7.16b, v7.16b, v7.16b	// vpxor	%xmm7,	%xmm7,	%xmm7
	ldp	x29, x30, [sp],#16
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b	// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0	// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
	eor	v7.16b, v7.16b, v1.16b	// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]	// vpshufd	$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b	// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
	eor	v7.16b, v7.16b, v4.16b	// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0,	%xmm10,	%xmm3		# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b	// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b	// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b	// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
	eor	v2.16b, v2.16b, v0.16b	// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b	// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b	// vmovdqa	%xmm0,	%xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round

##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v0.16b, #4	// vpsrlb	$4,	%xmm0,	%xmm0
	// vmovdqa	(%r11),	%xmm2 	# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	// vmovdqa	16(%r11),	%xmm1 # hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform

##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b	// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
	// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	w3, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b	// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	add	x2, x2, #16	// add	$16,	%rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
	eor	v4.16b, v4.16b, v1.16b	// vpxor	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]	// vmovdqa	(%r8,%r10),	%xmm1
	eor	v3.16b, v3.16b, v4.16b	// vpxor	%xmm4,	%xmm3,	%xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
	// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4	// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b	// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

	// vmovdqa	0x00(%r11),	%xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	// vmovdqa	0x10(%r11),	%xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

	// vmovdqa	0x20(%r11),	%xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor	%xmm3,	%xmm2,	%xmm2
	// vmovdqa	0x30(%r11),	%xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

	// vmovdqa	0x40(%r11),	%xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor	%xmm3,	%xmm2,	%xmm2
	// vmovdqa	0x50(%r11),	%xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3

	// vmovdqa	0x60(%r11),	%xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
	// vmovdqa	0x70(%r11),	%xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]	// vmovdqa	(%r8,%r10),	%xmm1
	eor	v2.16b, v2.16b, v3.16b	// vpxor	%xmm3,	%xmm2,	%xmm2
	eor	v3.16b, v4.16b, v2.16b	// vpxor	%xmm2,	%xmm4,	%xmm3

	sub	x2, x2, #16	// add	$-16,	%rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	add	x8, x8, #64-16	// add	$-16,	%r8
	and	x8, x8, #~(1<<6)	// and	$0x30,	%r8
	st1	{v3.2d}, [x2]	// vmovdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl	vpaes_set_encrypt_key
.hidden	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5	// shr	$5,%eax
	add	w9, w9, #5	// $5,%eax
	str	w9, [x2,#240]	// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	w3, #0	// mov	$0,%ecx
	mov	x8, #0x30	// mov	$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

.globl	vpaes_set_decrypt_key
.hidden	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5	// shr	$5,%eax
	add	w9, w9, #5	// $5,%eax
	str	w9, [x2,#240]	// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4	// shl	$4,%eax
	add	x2, x2, #16	// lea	16(%rdx,%rax),%rdx
	add	x2, x2, x9

	mov	w3, #1	// mov	$1,%ecx
	lsr	w8, w1, #1	// shr	$1,%r8d
	and	x8, x8, #32	// and	$32,%r8d
	eor	x8, x8, #32	// xor	$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl	vpaes_cbc_encrypt
.hidden	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	cbz	x2, .Lcbc_abort
	cmp	w5, #0	// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, x2	// reassign
	mov	x2,  x3	// reassign

	ld1	{v0.16b}, [x4]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [x0],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [x4]	// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2	// reassign
	mov	x2,  x3	// reassign
	ld1	{v6.16b}, [x4]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [x0], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [x4]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
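
// For reference: vpaes_cbc_encrypt above takes x0 = input, x1 = output, x2 = length
// in bytes (a multiple of 16; a zero length returns immediately), x3 = AES_KEY,
// x4 = 16-byte IV buffer (updated on return), and w5 = direction (non-zero encrypts,
// zero tail-calls vpaes_cbc_decrypt). A hedged sketch of the corresponding C
// prototype, assuming a BoringSSL-style declaration rather than anything stated in
// this file:
//
//   void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
//                          const AES_KEY *key, uint8_t *ivec, int enc);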
.globl	vpaes_ctr32_encrypt_blocks
.hidden	vpaes_ctr32_encrypt_blocks
.type	vpaes_ctr32_encrypt_blocks,%function
.align	4
vpaes_ctr32_encrypt_blocks:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	cbz	x2, .Lctr32_done

	// Note, unlike the other functions, x2 here is measured in blocks,
	// not bytes.
	mov	x17, x2
	mov	x2, x3

	// Load the IV and counter portion.
	ldr	w6, [x4, #12]
	ld1	{v7.16b}, [x4]

	bl	_vpaes_encrypt_preheat
	tst	x17, #1
	rev	w6, w6	// The counter is big-endian.
	b.eq	.Lctr32_prep_loop

	// Handle one block so the remaining block count is even for
	// _vpaes_encrypt_2x.
	ld1	{v6.16b}, [x0], #16	// .Load input ahead of time
	bl	_vpaes_encrypt_core
	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #1
	// Update the counter.
	add	w6, w6, #1
	rev	w7, w6
	mov	v7.s[3], w7
	b.ls	.Lctr32_done

.Lctr32_prep_loop:
	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
	// uses v14 and v15.
	mov	v15.16b, v7.16b
	mov	v14.16b, v7.16b
	add	w6, w6, #1
	rev	w7, w6
	mov	v15.s[3], w7

.Lctr32_loop:
	ld1	{v6.16b,v7.16b}, [x0], #32	// .Load input ahead of time
	bl	_vpaes_encrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
	eor	v1.16b, v1.16b, v7.16b	// XOR input and result (#2)
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #2
	// Update the counter.
	add	w7, w6, #1
	add	w6, w6, #2
	rev	w7, w7
	mov	v14.s[3], w7
	rev	w7, w6
	mov	v15.s[3], w7
	b.hi	.Lctr32_loop

.Lctr32_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	ret
.size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits
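
// A short end-to-end usage sketch in C for the key-setup and CTR entry points
// exported above. The prototypes are assumptions inferred from the register usage
// here (x0 = key bytes or input, w1 = key bits, x2 = AES_KEY or block count,
// x3 = AES_KEY, x4 = IV/counter), not declarations taken from this file:
//
//   int  vpaes_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *out);
//   void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks,
//                                   const AES_KEY *key, const uint8_t ivec[16]);
//
//   AES_KEY ks;
//   vpaes_set_encrypt_key(key_bytes, 128, &ks);                // 128-bit key
//   vpaes_ctr32_encrypt_blocks(in, out, num_blocks, &ks, iv);  // iv = IV || counter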