// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include "ring_core_generated/prefix_symbols_asm.h"
#include <ring-core/arm_arch.h>

.section .rodata

.type _vpaes_consts,%object
.align 7 // totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: // sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
.Lk_inv: // inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Key schedule constants
//
.Lk_dksd: // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon: // rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt: // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2
.size _vpaes_consts,.-_vpaes_consts
.align 6

.text
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
    adrp x10, .Lk_inv
    add x10, x10, :lo12:.Lk_inv
    movi v17.16b, #0x0f
    ld1 {v18.2d,v19.2d}, [x10],#32          // .Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10]     // .Lk_sb1, .Lk_sb2
    ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
##   %xmm0 = input
##   %xmm9-%xmm15 as in _vpaes_preheat
##   (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
    mov x9, x2
    ldr w8, [x2,#240]                       // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, :lo12:.Lk_mc_forward+16
    // vmovdqa .Lk_ipt(%rip), %xmm2         # iptlo
    ld1 {v16.2d}, [x9], #16                 // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v7.16b, v17.16b             // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4                 // vpsrlb $4, %xmm0, %xmm0
    tbl v1.16b, {v20.16b}, v1.16b           // vpshufb %xmm1, %xmm2, %xmm1
    // vmovdqa .Lk_ipt+16(%rip), %xmm3      # ipthi
    tbl v2.16b, {v21.16b}, v0.16b           // vpshufb %xmm0, %xmm3, %xmm2
    eor v0.16b, v1.16b, v16.16b             // vpxor %xmm5, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b              // vpxor %xmm2, %xmm0, %xmm0
    b .Lenc_entry

.align 4
.Lenc_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b           // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    ld1 {v1.2d}, [x11], #16                 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b           // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b             // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    tbl v5.16b, {v27.16b}, v2.16b           // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    eor v0.16b, v0.16b, v4.16b              // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v2.16b, {v26.16b}, v3.16b           // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    ld1 {v4.2d}, [x10]                      // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b            // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    eor v2.16b, v2.16b, v5.16b              // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    tbl v0.16b, {v0.16b}, v4.16b            // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    eor v3.16b, v3.16b, v2.16b              // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    tbl v4.16b, {v3.16b}, v1.16b            // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    eor v0.16b, v0.16b, v3.16b              // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    and x11, x11, #~(1<<6)                  // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b              // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    sub w8, w8, #1                          // nr--

.Lenc_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b             // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4                 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    tbl v5.16b, {v19.16b}, v1.16b           // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b              // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b           // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b           // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v3.16b, v3.16b, v5.16b              // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v5.16b              // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b           // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b           // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b              // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v3.16b, v3.16b, v0.16b              // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    ld1 {v16.2d}, [x9],#16                  // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4           # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0           # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b           // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    ld1 {v1.2d}, [x10]                      // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b           // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b             // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v0.16b, v0.16b, v4.16b              // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    tbl v0.16b, {v0.16b}, v1.16b            // vpshufb %xmm1, %xmm0, %xmm0
    ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl vpaes_encrypt
.hidden vpaes_encrypt
.type vpaes_encrypt,%function
.align 4
vpaes_encrypt:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_encrypt_preheat
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_encrypt,.-vpaes_encrypt

.type _vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240]                       // pull rounds
    adrp x11, .Lk_mc_forward+16
    add x11, x11, :lo12:.Lk_mc_forward+16
    // vmovdqa .Lk_ipt(%rip), %xmm2         # iptlo
    ld1 {v16.2d}, [x9], #16                 // vmovdqu (%r9), %xmm5 # round0 key
    and v1.16b, v14.16b, v17.16b            // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4                // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v1.16b, {v20.16b}, v1.16b           // vpshufb %xmm1, %xmm2, %xmm1
    tbl v9.16b, {v20.16b}, v9.16b
    // vmovdqa .Lk_ipt+16(%rip), %xmm3      # ipthi
    tbl v2.16b, {v21.16b}, v0.16b           // vpshufb %xmm0, %xmm3, %xmm2
    tbl v10.16b, {v21.16b}, v8.16b
    eor v0.16b, v1.16b, v16.16b             // vpxor %xmm5, %xmm1, %xmm0
    eor v8.16b, v9.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b              // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b .Lenc_2x_entry

.align 4
.Lenc_2x_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b           // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    tbl v12.16b, {v25.16b}, v10.16b
    ld1 {v1.2d}, [x11], #16                 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b           // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    tbl v8.16b, {v24.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b             // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    tbl v5.16b, {v27.16b}, v2.16b           // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    tbl v13.16b, {v27.16b}, v10.16b
    eor v0.16b, v0.16b, v4.16b              // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v2.16b, {v26.16b}, v3.16b           // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    tbl v10.16b, {v26.16b}, v11.16b
    ld1 {v4.2d}, [x10]                      // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b            // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    tbl v11.16b, {v8.16b}, v1.16b
    eor v2.16b, v2.16b, v5.16b              // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    eor v10.16b, v10.16b, v13.16b
    tbl v0.16b, {v0.16b}, v4.16b            // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    tbl v8.16b, {v8.16b}, v4.16b
    eor v3.16b, v3.16b, v2.16b              // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    eor v11.16b, v11.16b, v10.16b
    tbl v4.16b, {v3.16b}, v1.16b            // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    tbl v12.16b, {v11.16b},v1.16b
    eor v0.16b, v0.16b, v3.16b              // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    eor v8.16b, v8.16b, v11.16b
    and x11, x11, #~(1<<6)                  // and $0x30, %r11 # ... mod 4
    eor v0.16b, v0.16b, v4.16b              // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    eor v8.16b, v8.16b, v12.16b
    sub w8, w8, #1                          // nr--

.Lenc_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b             // vpand %xmm0, %xmm9, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4                 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v5.16b, {v19.16b},v1.16b            // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    tbl v13.16b, {v19.16b},v9.16b
    eor v1.16b, v1.16b, v0.16b              // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b},v0.16b            // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    tbl v11.16b, {v18.16b},v8.16b
    tbl v4.16b, {v18.16b},v1.16b            // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    tbl v12.16b, {v18.16b},v9.16b
    eor v3.16b, v3.16b, v5.16b              // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v13.16b
    eor v4.16b, v4.16b, v5.16b              // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v13.16b
    tbl v2.16b, {v18.16b},v3.16b            // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    tbl v10.16b, {v18.16b},v11.16b
    tbl v3.16b, {v18.16b},v4.16b            // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    tbl v11.16b, {v18.16b},v12.16b
    eor v2.16b, v2.16b, v1.16b              // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b              // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16                  // vmovdqu (%r9), %xmm5
    cbnz w8, .Lenc_2x_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4           # 3 : sbou .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0           # 0 : sbot .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b           // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    ld1 {v1.2d}, [x10]                      // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b           // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    tbl v8.16b, {v23.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b             // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v0.16b, v4.16b              // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v0.16b, {v0.16b},v1.16b             // vpshufb %xmm1, %xmm0, %xmm0
    tbl v1.16b, {v8.16b},v1.16b
    ret
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
    adrp x10, .Lk_inv
    add x10, x10, :lo12:.Lk_inv
    movi v16.16b, #0x5b                     // .Lk_s63
    adrp x11, .Lk_sb1
    add x11, x11, :lo12:.Lk_sb1
    movi v17.16b, #0x0f                     // .Lk_s0F
    ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
    adrp x10, .Lk_dksd
    add x10, x10, :lo12:.Lk_dksd
    ld1 {v22.2d,v23.2d}, [x11]              // .Lk_sb1
    adrp x11, .Lk_mc_forward
    add x11, x11, :lo12:.Lk_mc_forward
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
    ld1 {v8.2d}, [x10]                      // .Lk_rcon
    ld1 {v9.2d}, [x11]                      // .Lk_mc_forward[0]
    ret
.size _vpaes_key_preheat,.-_vpaes_key_preheat

.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
    AARCH64_SIGN_LINK_REGISTER
    stp x29, x30, [sp,#-16]!
    add x29,sp,#0

    bl _vpaes_key_preheat                   // load the tables

    ld1 {v0.16b}, [x0],#16                  // vmovdqu (%rdi), %xmm0 # load key (unaligned)

    // input transform
    mov v3.16b, v0.16b                      // vmovdqa %xmm0, %xmm3
    bl _vpaes_schedule_transform
    mov v7.16b, v0.16b                      // vmovdqa %xmm0, %xmm7

    adrp x10, .Lk_sr                        // lea .Lk_sr(%rip),%r10
    add x10, x10, :lo12:.Lk_sr

    add x8, x8, x10

    // encrypting, output zeroth round key after transform
    st1 {v0.2d}, [x2]                       // vmovdqu %xmm0, (%rdx)

    cmp w1, #192                            // cmp $192, %esi
    b.hi .Lschedule_256
    b.eq .Lschedule_192
    // 128: fall through

##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
.Lschedule_128:
    mov x0, #10                             // mov $10, %esi

.Loop_schedule_128:
    sub x0, x0, #1                          // dec %esi
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle               // write output
    b .Loop_schedule_128

##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
.align 4
.Lschedule_192:
    sub x0, x0, #8
    ld1 {v0.16b}, [x0]                      // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
    bl _vpaes_schedule_transform            // input transform
    mov v6.16b, v0.16b                      // vmovdqa %xmm0, %xmm6 # save short part
    eor v4.16b, v4.16b, v4.16b              // vpxor %xmm4, %xmm4, %xmm4 # clear 4
    ins v6.d[0], v4.d[0]                    // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
    mov x0, #4                              // mov $4, %esi

.Loop_schedule_192:
    sub x0, x0, #1                          // dec %esi
    bl _vpaes_schedule_round
    ext v0.16b, v6.16b, v0.16b, #8          // vpalignr $8,%xmm6,%xmm0,%xmm0
    bl _vpaes_schedule_mangle               // save key n
    bl _vpaes_schedule_192_smear
    bl _vpaes_schedule_mangle               // save key n+1
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle               // save key n+2
    bl _vpaes_schedule_192_smear
    b .Loop_schedule_192

##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
.align 4
.Lschedule_256:
    ld1 {v0.16b}, [x0]                      // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
    bl _vpaes_schedule_transform            // input transform
    mov x0, #7                              // mov $7, %esi

.Loop_schedule_256:
    sub x0, x0, #1                          // dec %esi
    bl _vpaes_schedule_mangle               // output low result
    mov v6.16b, v0.16b                      // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

    // high round
    bl _vpaes_schedule_round
    cbz x0, .Lschedule_mangle_last
    bl _vpaes_schedule_mangle

    // low round. swap xmm7 and xmm6
    dup v0.4s, v0.s[3]                      // vpshufd $0xFF, %xmm0, %xmm0
    movi v4.16b, #0
    mov v5.16b, v7.16b                      // vmovdqa %xmm7, %xmm5
    mov v7.16b, v6.16b                      // vmovdqa %xmm6, %xmm7
    bl _vpaes_schedule_low_round
    mov v7.16b, v5.16b                      // vmovdqa %xmm5, %xmm7

    b .Loop_schedule_256

##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
##   when encrypting, outputs out(%xmm0) ^ 63
##   when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
.align 4
.Lschedule_mangle_last:
    // schedule last round key from xmm0
    adrp x11, .Lk_deskew                    // lea .Lk_deskew(%rip),%r11 # prepare to deskew
    add x11, x11, :lo12:.Lk_deskew

    cbnz w3, .Lschedule_mangle_last_dec

    // encrypting
    ld1 {v1.2d}, [x8]                       // vmovdqa (%r8,%r10),%xmm1
    adrp x11, .Lk_opt                       // lea .Lk_opt(%rip), %r11 # prepare to output transform
    add x11, x11, :lo12:.Lk_opt
    add x2, x2, #32                         // add $32, %rdx
    tbl v0.16b, {v0.16b}, v1.16b            // vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
    ld1 {v20.2d,v21.2d}, [x11]              // reload constants
    sub x2, x2, #16                         // add $-16, %rdx
    eor v0.16b, v0.16b, v16.16b             // vpxor .Lk_s63(%rip), %xmm0, %xmm0
    bl _vpaes_schedule_transform            // output transform
    st1 {v0.2d}, [x2]                       // vmovdqu %xmm0, (%rdx) # save last key

    // cleanup
    eor v0.16b, v0.16b, v0.16b              // vpxor %xmm0, %xmm0, %xmm0
    eor v1.16b, v1.16b, v1.16b              // vpxor %xmm1, %xmm1, %xmm1
    eor v2.16b, v2.16b, v2.16b              // vpxor %xmm2, %xmm2, %xmm2
    eor v3.16b, v3.16b, v3.16b              // vpxor %xmm3, %xmm3, %xmm3
    eor v4.16b, v4.16b, v4.16b              // vpxor %xmm4, %xmm4, %xmm4
    eor v5.16b, v5.16b, v5.16b              // vpxor %xmm5, %xmm5, %xmm5
    eor v6.16b, v6.16b, v6.16b              // vpxor %xmm6, %xmm6, %xmm6
    eor v7.16b, v7.16b, v7.16b              // vpxor %xmm7, %xmm7, %xmm7
    ldp x29, x30, [sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core

##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
##   %xmm7: high side, b a x y
##   %xmm6: low side, d c 0 0
##   %xmm13: 0
##
## Outputs:
##   %xmm6: b+c+d b+c 0 0
##   %xmm0: b+c+d b+c b a
##
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
    movi v1.16b, #0
    dup v0.4s, v7.s[3]
    ins v1.s[3], v6.s[2]                    // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
    ins v0.s[0], v7.s[2]                    // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
    eor v6.16b, v6.16b, v1.16b              // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
    eor v1.16b, v1.16b, v1.16b              // vpxor %xmm1, %xmm1, %xmm1
    eor v6.16b, v6.16b, v0.16b              // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
    mov v0.16b, v6.16b                      // vmovdqa %xmm6, %xmm0
    ins v6.d[0], v1.d[0]                    // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
    ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear

##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
    // extract rcon from xmm8
    movi v4.16b, #0                         // vpxor %xmm4, %xmm4, %xmm4
    ext v1.16b, v8.16b, v4.16b, #15         // vpalignr $15, %xmm8, %xmm4, %xmm1
    ext v8.16b, v8.16b, v8.16b, #15         // vpalignr $15, %xmm8, %xmm8, %xmm8
    eor v7.16b, v7.16b, v1.16b              // vpxor %xmm1, %xmm7, %xmm7

    // rotate
    dup v0.4s, v0.s[3]                      // vpshufd $0xFF, %xmm0, %xmm0
    ext v0.16b, v0.16b, v0.16b, #1          // vpalignr $1, %xmm0, %xmm0, %xmm0

    // fall through...

    // low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
    // smear xmm7
    ext v1.16b, v4.16b, v7.16b, #12         // vpslldq $4, %xmm7, %xmm1
    eor v7.16b, v7.16b, v1.16b              // vpxor %xmm1, %xmm7, %xmm7
    ext v4.16b, v4.16b, v7.16b, #8          // vpslldq $8, %xmm7, %xmm4

    // subbytes
    and v1.16b, v0.16b, v17.16b             // vpand %xmm9, %xmm0, %xmm1 # 0 = k
    ushr v0.16b, v0.16b, #4                 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
    eor v7.16b, v7.16b, v4.16b              // vpxor %xmm4, %xmm7, %xmm7
    tbl v2.16b, {v19.16b}, v1.16b           // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    eor v1.16b, v1.16b, v0.16b              // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b           // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    eor v3.16b, v3.16b, v2.16b              // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    tbl v4.16b, {v18.16b}, v1.16b           // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    eor v7.16b, v7.16b, v16.16b             // vpxor .Lk_s63(%rip), %xmm7, %xmm7
    tbl v3.16b, {v18.16b}, v3.16b           // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
    eor v4.16b, v4.16b, v2.16b              // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v4.16b           // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
    eor v3.16b, v3.16b, v1.16b              // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
    eor v2.16b, v2.16b, v0.16b              // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
    tbl v4.16b, {v23.16b}, v3.16b           // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
    tbl v1.16b, {v22.16b}, v2.16b           // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
    eor v1.16b, v1.16b, v4.16b              // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

    // add in smeared stuff
    eor v0.16b, v1.16b, v7.16b              // vpxor %xmm7, %xmm1, %xmm0
    eor v7.16b, v1.16b, v7.16b              // vmovdqa %xmm0, %xmm7
    ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round

##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
    and v1.16b, v0.16b, v17.16b             // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v0.16b, #4                 // vpsrlb $4, %xmm0, %xmm0
    // vmovdqa (%r11), %xmm2                # lo
    tbl v2.16b, {v20.16b}, v1.16b           // vpshufb %xmm1, %xmm2, %xmm2
    // vmovdqa 16(%r11), %xmm1              # hi
    tbl v0.16b, {v21.16b}, v0.16b           // vpshufb %xmm0, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b              // vpxor %xmm2, %xmm0, %xmm0
    ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform

##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
##   xor with 0x63
##   multiply by circulant 0,1,1,1
##   apply shiftrows transform
##
## On decrypt,
##   xor with 0x63
##   multiply by "inverse mixcolumns" circulant E,B,D,9
##   deskew
##   apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
    mov v4.16b, v0.16b                      // vmovdqa %xmm0, %xmm4 # save xmm0 for later
    // vmovdqa .Lk_mc_forward(%rip),%xmm5

    // encrypting
    eor v4.16b, v0.16b, v16.16b             // vpxor .Lk_s63(%rip), %xmm0, %xmm4
    add x2, x2, #16                         // add $16, %rdx
    tbl v4.16b, {v4.16b}, v9.16b            // vpshufb %xmm5, %xmm4, %xmm4
    tbl v1.16b, {v4.16b}, v9.16b            // vpshufb %xmm5, %xmm4, %xmm1
    tbl v3.16b, {v1.16b}, v9.16b            // vpshufb %xmm5, %xmm1, %xmm3
    eor v4.16b, v4.16b, v1.16b              // vpxor %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8]                       // vmovdqa (%r8,%r10), %xmm1
    eor v3.16b, v3.16b, v4.16b              // vpxor %xmm4, %xmm3, %xmm3

.Lschedule_mangle_both:
    tbl v3.16b, {v3.16b}, v1.16b            // vpshufb %xmm1, %xmm3, %xmm3
    add x8, x8, #48                         // add $-16, %r8
    and x8, x8, #~(1<<6)                    // and $0x30, %r8
    st1 {v3.2d}, [x2]                       // vmovdqu %xmm3, (%rdx)
    ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle

.globl vpaes_set_encrypt_key
.hidden vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!                    // ABI spec says so

    lsr w9, w1, #5                          // shr $5,%eax
    add w9, w9, #5                          // $5,%eax
    str w9, [x2,#240]                       // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

    mov w3, #0                              // mov $0,%ecx
    mov x8, #0x30                           // mov $0x30,%r8d
    bl _vpaes_schedule_core
    eor x0, x0, x0

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
.globl vpaes_ctr32_encrypt_blocks
.hidden vpaes_ctr32_encrypt_blocks
.type vpaes_ctr32_encrypt_blocks,%function
.align 4
vpaes_ctr32_encrypt_blocks:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!                    // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    cbz x2, .Lctr32_done

    // Note, unlike the other functions, x2 here is measured in blocks,
    // not bytes.
    mov x17, x2
    mov x2, x3

    // Load the IV and counter portion.
    ldr w6, [x4, #12]
    ld1 {v7.16b}, [x4]

    bl _vpaes_encrypt_preheat
    tst x17, #1
    rev w6, w6                              // The counter is big-endian.
    b.eq .Lctr32_prep_loop

    // Handle one block so the remaining block count is even for
    // _vpaes_encrypt_2x.
    ld1 {v6.16b}, [x0], #16                 // .Load input ahead of time
    bl _vpaes_encrypt_core
    eor v0.16b, v0.16b, v6.16b              // XOR input and result
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #1
    // Update the counter.
    add w6, w6, #1
    rev w7, w6
    mov v7.s[3], w7
    b.ls .Lctr32_done

.Lctr32_prep_loop:
    // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
    // uses v14 and v15.
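    // v14 carries the current counter block and v15 the next one
    // (counter + 1), so each pass through .Lctr32_loop encrypts two
    // consecutive counter values before both lanes are advanced by two.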
    mov v15.16b, v7.16b
    mov v14.16b, v7.16b
    add w6, w6, #1
    rev w7, w6
    mov v15.s[3], w7

.Lctr32_loop:
    ld1 {v6.16b,v7.16b}, [x0], #32          // .Load input ahead of time
    bl _vpaes_encrypt_2x
    eor v0.16b, v0.16b, v6.16b              // XOR input and result
    eor v1.16b, v1.16b, v7.16b              // XOR input and result (#2)
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #2
    // Update the counter.
    add w7, w6, #1
    add w6, w6, #2
    rev w7, w7
    mov v14.s[3], w7
    rev w7, w6
    mov v15.s[3], w7
    b.hi .Lctr32_loop

.Lctr32_done:
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits