// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__arm__)
.syntax unified

.arch armv7-a
.fpu neon

#if defined(__thumb2__)
.thumb
#else
.code 32
#endif

.text

@ Lookup tables for the vector-permute AES implementation. They are
@ addressed PC-relative with adr (see _vpaes_preheat and friends), so they
@ must stay within adr range of the code that references them, and the
@ "Must be aligned to 8 mod 16" notes at those adr sites must keep holding.
.type _vpaes_consts,%object
.align 7 @ totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:@ mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:@ mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:@ sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508

@
@ "Hot" constants
@
.Lk_inv:@ inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:@ input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:@ sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:@ sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:@ sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD

@ ASCII banner: "Vector Permutation AES for ARMv7 NEON, Mike Hamburg
@ (Stanford University)", NUL-terminated.
.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2
.size _vpaes_consts,.-_vpaes_consts
.align 6
@@
@@ _aes_preheat
@@
@@ Fills q9-q15 as specified below.
@@
.type _vpaes_preheat,%function
.align 4
_vpaes_preheat:
    adr r10, .Lk_inv
    vmov.i8 q9, #0x0f @ .Lk_s0F
    vld1.64 {q10,q11}, [r10]! @ .Lk_inv
    add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo
    vld1.64 {q12,q13}, [r10]! @ .Lk_sb1
    vld1.64 {q14,q15}, [r10] @ .Lk_sb2
    bx lr

@@
@@ _aes_encrypt_core
@@
@@ AES-encrypt q0.
@@
@@ Inputs:
@@ q0 = input
@@ q9-q15 as in _vpaes_preheat
@@ [r2] = scheduled keys
@@
@@ Output in q0
@@ Clobbers q1-q5, r8-r11
@@ Preserves q6-q8 so you get some local vectors
@@
@@
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
    mov r9, r2
    ldr r8, [r2,#240] @ pull rounds
    adr r11, .Lk_ipt
    @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
    @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
    vld1.64 {q2, q3}, [r11]
    adr r11, .Lk_mc_forward+16
    vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
    vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
    vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
    vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1
    vtbl.8 d3, {q2}, d3
    vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2
    vtbl.8 d5, {q3}, d1
    veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
    veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0

    @ .Lenc_entry ends with a bnz instruction which is normally paired with
    @ subs in .Lenc_loop.
    tst r8, r8
    b .Lenc_entry

.align 4
.Lenc_loop:
    @ middle of middle round
    add r10, r11, #0x40
    vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
    vtbl.8 d9, {q13}, d5
    vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
    vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
    vtbl.8 d1, {q12}, d7
    veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
    vtbl.8 d11, {q15}, d5
    veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
    vtbl.8 d5, {q14}, d7
    vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
    vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
    vtbl.8 d7, {q0}, d3
    veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
    @ Write to q5 instead of q0, so the table and destination registers do
    @ not overlap.
    vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
    vtbl.8 d11, {q0}, d9
    veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
    vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
    vtbl.8 d9, {q3}, d3
    @ Here we restore the original q0/q5 usage.
    veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
    and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4
    veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
    subs r8, r8, #1 @ nr--

.Lenc_entry:
    @ top of round
    vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
    vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
    vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
    vtbl.8 d11, {q11}, d3
    veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    vtbl.8 d7, {q10}, d1
    vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    vtbl.8 d9, {q10}, d3
    veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
    vtbl.8 d5, {q10}, d7
    vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
    vtbl.8 d7, {q10}, d9
    veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
    veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
    vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
    bne .Lenc_loop

    @ middle of last round
    add r10, r11, #0x80

    adr r11, .Lk_sbo
    @ Read to q1 instead of q4, so the vtbl.8 instruction below does not
    @ overlap table and destination registers.
    vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
    vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
    vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
    vtbl.8 d9, {q1}, d5
    vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
    @ Write to q2 instead of q0 below, to avoid overlapping table and
    @ destination registers.
    vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
    vtbl.8 d5, {q0}, d7
    veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
    veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
    @ Here we restore the original q0/q2 usage.
    vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0
    vtbl.8 d1, {q2}, d3
    bx lr
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core

@@
@@ void GFp_vpaes_encrypt(const uint8_t in[16]  /* r0 */,
@@                        uint8_t out[16]       /* r1 */,
@@                        const AES_KEY *key    /* r2 */);
@@
@@ Encrypts a single 16-byte block: loads q0 from [r0], runs
@@ _vpaes_preheat + _vpaes_encrypt_core (key taken from r2), and stores
@@ the result to [r1]. Saves/restores the callee-saved registers those
@@ helpers clobber.
@@
.globl GFp_vpaes_encrypt
.hidden GFp_vpaes_encrypt
.type GFp_vpaes_encrypt,%function
.align 4
GFp_vpaes_encrypt:
    @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
    @ alignment.
    stmdb sp!, {r7,r8,r9,r10,r11,lr}
    @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
    vstmdb sp!, {d8,d9,d10,d11}

    vld1.64 {q0}, [r0]
    bl _vpaes_preheat
    bl _vpaes_encrypt_core
    vst1.64 {q0}, [r1]

    vldmia sp!, {d8,d9,d10,d11}
    ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@ @@
@@ AES key schedule @@
@@ @@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

@ This function diverges from both x86_64 and armv7 in which constants are
@ pinned. x86_64 has a common preheat function for all operations. aarch64
@ separates them because it has enough registers to pin nearly all constants.
@ armv7 does not have enough registers, but needing explicit loads and stores
@ also complicates using x86_64's register allocation directly.
@
@ We pin some constants for convenience and leave q14 and q15 free to load
@ others on demand.

@
@ Key schedule constants
@
.type _vpaes_key_consts,%object
.align 4
_vpaes_key_consts:
.Lk_rcon:@ rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:@ output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:@ deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.size _vpaes_key_consts,.-_vpaes_key_consts

@ Loads the constants the key schedule keeps pinned:
@ q8 = .Lk_rcon, q9 = 0x0f nibble mask (.Lk_s0F), q10/q11 = .Lk_inv,
@ q12 = 0x5b (.Lk_s63). Clobbers r10 and r11.
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
    adr r11, .Lk_rcon
    vmov.i8 q12, #0x5b @ .Lk_s63
    adr r10, .Lk_inv @ Must be aligned to 8 mod 16.
    vmov.i8 q9, #0x0f @ .Lk_s0F
    vld1.64 {q10,q11}, [r10] @ .Lk_inv
    vld1.64 {q8}, [r11] @ .Lk_rcon
    bx lr
.size _vpaes_key_preheat,.-_vpaes_key_preheat

@ Core (encrypt-only) key schedule.
@ Inputs as set up by GFp_vpaes_set_encrypt_key:
@   r0 = user key bytes (read with post-incremented vld1)
@   r1 = key size in bits (compared against 192 below)
@   r2 = output round-key buffer
@   r8 = initial byte offset into .Lk_sr used by the mangle step
@ Zeroes q0-q7 before returning so no key material is left in registers.
.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
    @ We only need to save lr, but ARM requires an 8-byte stack alignment,
    @ so save an extra register.
    stmdb sp!, {r3,lr}

    bl _vpaes_key_preheat @ load the tables

    adr r11, .Lk_ipt @ Must be aligned to 8 mod 16.
    vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)

    @ input transform
    @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
    @ overlap table and destination.
    vmov q4, q0 @ vmovdqa %xmm0, %xmm3
    bl _vpaes_schedule_transform
    adr r10, .Lk_sr @ Must be aligned to 8 mod 16.
    vmov q7, q0 @ vmovdqa %xmm0, %xmm7

    add r8, r8, r10

    @ encrypting, output zeroth round key after transform
    vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx)

    @ *ring*: Decryption removed.

.Lschedule_go:
    cmp r1, #192 @ cmp $192, %esi
    bhi .Lschedule_256
    @ 128: fall through
    @ NOTE(review): a 192-bit request would also fall into the 128-bit
    @ path here; callers are expected to pass only 128 or 256 — confirm
    @ before reusing this entry point.

@@
@@ .schedule_128
@@
@@ 128-bit specific part of key schedule.
@@
@@ This schedule is really simple, because all its parts
@@ are accomplished by the subroutines.
@@
.Lschedule_128:
    mov r0, #10 @ mov $10, %esi

.Loop_schedule_128:
    bl _vpaes_schedule_round
    subs r0, r0, #1 @ dec %esi
    beq .Lschedule_mangle_last
    bl _vpaes_schedule_mangle @ write output
    b .Loop_schedule_128

@@
@@ .aes_schedule_256
@@
@@ 256-bit specific part of key schedule.
@@
@@ The structure here is very similar to the 128-bit
@@ schedule, but with an additional "low side" in
@@ q6. The low side's rounds are the same as the
@@ high side's, except no rcon and no rotation.
@@
.align 4
.Lschedule_256:
    vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
    bl _vpaes_schedule_transform @ input transform
    mov r0, #7 @ mov $7, %esi

.Loop_schedule_256:
    bl _vpaes_schedule_mangle @ output low result
    vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

    @ high round
    bl _vpaes_schedule_round
    subs r0, r0, #1 @ dec %esi
    beq .Lschedule_mangle_last
    bl _vpaes_schedule_mangle

    @ low round. swap xmm7 and xmm6
    vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
    vmov.i8 q4, #0
    vmov q5, q7 @ vmovdqa %xmm7, %xmm5
    vmov q7, q6 @ vmovdqa %xmm6, %xmm7
    bl _vpaes_schedule_low_round
    vmov q7, q5 @ vmovdqa %xmm5, %xmm7

    b .Loop_schedule_256

@@
@@ .aes_schedule_mangle_last
@@
@@ Mangler for last round of key schedule
@@ Mangles q0
@@ when encrypting, outputs out(q0) ^ 63
@@ when decrypting, outputs unskew(q0)
@@
@@ Always called right before return... jumps to cleanup and exits
@@
.align 4
.Lschedule_mangle_last:
    @ schedule last round key from xmm0
    adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew

    @ encrypting
    vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
    adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform
    add r2, r2, #32 @ add $32, %rdx
    vmov q2, q0
    vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
    vtbl.8 d1, {q2}, d3

.Lschedule_mangle_last_dec:
    sub r2, r2, #16 @ add $-16, %rdx
    veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0
    bl _vpaes_schedule_transform @ output transform
    vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key

    @ cleanup
    veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
    veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
    veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
    veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
    veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
    veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
    veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
    veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
    ldmia sp!, {r3,pc} @ return
.size _vpaes_schedule_core,.-_vpaes_schedule_core

@@
@@ .aes_schedule_round
@@
@@ Runs one main round of the key schedule on q0, q7
@@
@@ Specifically, runs subbytes on the high dword of q0
@@ then rotates it by one byte and xors into the low dword of
@@ q7.
@@
@@ Adds rcon from low byte of q8, then rotates q8 for
@@ next rcon.
@@
@@ Smears the dwords of q7 by xoring the low into the
@@ second low, result into third, result into highest.
@@
@@ Returns results in q7 = q0.
@@ Clobbers q1-q4, r11.
@@
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
    @ extract rcon from xmm8
    vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
    vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1
    vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8
    veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7

    @ rotate
    vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
    vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0

    @ fall through...

    @ low round: same as high round, but no rotation and no rcon.
    @ NOTE(review): callers are expected to enter with q4 zeroed — the
    @ smear step below uses q4 as the zero source for its vext shifts
    @ (both call paths zero q4 immediately beforehand).
_vpaes_schedule_low_round:
    @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
    @ We pin other values in _vpaes_key_preheat, so load them now.
    adr r11, .Lk_sb1
    vld1.64 {q14,q15}, [r11]

    @ smear xmm7
    vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1
    veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
    vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4

    @ subbytes
    vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
    vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
    veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
    vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
    vtbl.8 d5, {q11}, d3
    veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
    vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
    vtbl.8 d7, {q10}, d1
    veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
    vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
    vtbl.8 d9, {q10}, d3
    veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7
    vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
    vtbl.8 d7, {q10}, d7
    veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
    vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
    vtbl.8 d5, {q10}, d9
    veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
    veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
    vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
    vtbl.8 d9, {q15}, d7
    vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
    vtbl.8 d3, {q14}, d5
    veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

    @ add in smeared stuff
    veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
    veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
    bx lr
.size _vpaes_schedule_round,.-_vpaes_schedule_round

@@
@@ .aes_schedule_transform
@@
@@ Linear-transform q0 according to tables at [r11]
@@
@@ Requires that q9 = 0x0F0F... as in preheat
@@ Output in q0
@@ Clobbers q1, q2, q14, q15
@@
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
    vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
    @ vmovdqa 16(%r11), %xmm1 # hi
    vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
    vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
    vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
    vtbl.8 d5, {q14}, d3
    vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
    vtbl.8 d1, {q15}, d1
    veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
    bx lr
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform

@@
@@ .aes_schedule_mangle
@@
@@ Mangles q0 from (basis-transformed) standard version
@@ to our version.
@@
@@ On encrypt,
@@ xor with 0x63
@@ multiply by circulant 0,1,1,1
@@ apply shiftrows transform
@@
@@ On decrypt,
@@ xor with 0x63
@@ multiply by "inverse mixcolumns" circulant E,B,D,9
@@ deskew
@@ apply shiftrows transform
@@
@@
@@ Writes out to [r2], and increments or decrements it
@@ Keeps track of round number mod 4 in r8
@@ Preserves q0
@@ Clobbers q1-q5
@@
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
    @ NOTE(review): the flags set by this tst are not consumed in the code
    @ below — the decrypt branch that tested r3 was removed upstream
    @ ("*ring*: Decryption removed.") and only the encrypt path remains.
    tst r3, r3
    vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
    adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16.
    vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5

    @ encrypting
    @ Write to q2 so we do not overlap table and destination below.
    veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4
    add r2, r2, #16 @ add $16, %rdx
    vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4
    vtbl.8 d9, {q2}, d11
    vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1
    vtbl.8 d3, {q4}, d11
    vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3
    vtbl.8 d7, {q1}, d11
    veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
    vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
    veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3

.Lschedule_mangle_both:
    @ Write to q2 so table and destination do not overlap.
    vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
    vtbl.8 d5, {q3}, d3
    add r8, r8, #64-16 @ add $-16, %r8
    and r8, r8, #~(1<<6) @ and $0x30, %r8
    vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx)
    bx lr
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle

@ int GFp_vpaes_set_encrypt_key(const uint8_t *key /* r0 */,
@                               unsigned bits     /* r1 */,
@                               AES_KEY *out      /* r2 */);
@ Stores rounds = bits/32 + 5 into out->rounds ([r2,#240]), runs the key
@ schedule with r3 = 0 (encrypt) and r8 = 0x30 (.Lk_sr offset), and always
@ returns 0 (eor r0, r0, r0).
.globl GFp_vpaes_set_encrypt_key
.hidden GFp_vpaes_set_encrypt_key
.type GFp_vpaes_set_encrypt_key,%function
.align 4
GFp_vpaes_set_encrypt_key:
    stmdb sp!, {r7,r8,r9,r10,r11, lr}
    vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}

    lsr r9, r1, #5 @ shr $5,%eax
    add r9, r9, #5 @ $5,%eax
    str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

    mov r3, #0 @ mov $0,%ecx
    mov r8, #0x30 @ mov $0x30,%r8d
    bl _vpaes_schedule_core
    eor r0, r0, r0

    vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
    ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key

@ Additional constants for converting to bsaes.
.type _vpaes_convert_consts,%object
.align 4
_vpaes_convert_consts:
@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
@ transform in the AES S-box. 0x63 is incorporated into the low half of the
@ table. This was computed with the following script:
@
@   def u64s_to_u128(x, y):
@       return x | (y << 64)
@   def u128_to_u64s(w):
@       return w & ((1<<64)-1), w >> 64
@   def get_byte(w, i):
@       return (w >> (i*8)) & 0xff
@   def apply_table(table, b):
@       lo = b & 0xf
@       hi = b >> 4
@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
@   def opt(b):
@       table = [
@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
@       ]
@       return apply_table(table, b)
@   def rot_byte(b, n):
@       return 0xff & ((b << n) | (b >> (8-n)))
@   def skew(x):
@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
@               rot_byte(x, 4))
@   table = [0, 0]
@   for i in range(16):
@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
@       table[1] |= skew(opt(i<<4)) << (i*8)
@   print("    .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0]))
@   print("    .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1]))
.Lk_opt_then_skew:
.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
.quad 0x1f30062936192f00, 0xb49bad829db284ab

@ void GFp_vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
.globl GFp_vpaes_encrypt_key_to_bsaes
.hidden GFp_vpaes_encrypt_key_to_bsaes
.type GFp_vpaes_encrypt_key_to_bsaes,%function
.align 4
GFp_vpaes_encrypt_key_to_bsaes:
    stmdb sp!, {r11, lr}

    @ See _vpaes_schedule_core for the key schedule logic. In particular,
    @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
    @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
    @ contain the transformations not in the bsaes representation. This
    @ function inverts those transforms.
    @
    @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
    @ representation, which does not match the other aes_nohw_*
    @ implementations. The ARM aes_nohw_* stores each 32-bit word
    @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
    @ cost of extra REV and VREV32 operations in little-endian ARM.

    vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
    adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16.
    add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)

    vld1.64 {q12}, [r2]
    vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64
    adr r11, .Lk_opt @ Must be aligned to 8 mod 16.
    vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied

    @ vpaes stores one fewer round count than bsaes, but the number of keys
    @ is the same.
    ldr r2, [r1,#240]
    add r2, r2, #1
    str r2, [r0,#240]

    @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
    @ Invert this with .Lk_opt.
    vld1.64 {q0}, [r1]!
    bl _vpaes_schedule_transform
    vrev32.8 q0, q0
    vst1.64 {q0}, [r0]!

    @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
    @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
    @ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
.Loop_enc_key_to_bsaes:
    vld1.64 {q0}, [r1]!

    @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
    @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
    @ We use r3 rather than r8 to avoid a callee-saved register.
    vld1.64 {q1}, [r3]
    vtbl.8 d4, {q0}, d2
    vtbl.8 d5, {q0}, d3
    add r3, r3, #16
    and r3, r3, #~(1<<6)
    vmov q0, q2

    @ Handle the last key differently.
    subs r2, r2, #1
    beq .Loop_enc_key_to_bsaes_last

    @ Multiply by the circulant. This is its own inverse.
    vtbl.8 d2, {q0}, d24
    vtbl.8 d3, {q0}, d25
    vmov q0, q1
    vtbl.8 d4, {q1}, d24
    vtbl.8 d5, {q1}, d25
    veor q0, q0, q2
    vtbl.8 d2, {q2}, d24
    vtbl.8 d3, {q2}, d25
    veor q0, q0, q1

    @ XOR and finish.
    veor q0, q0, q10
    bl _vpaes_schedule_transform
    vrev32.8 q0, q0
    vst1.64 {q0}, [r0]!
    b .Loop_enc_key_to_bsaes

.Loop_enc_key_to_bsaes_last:
    @ The final key does not have a basis transform (note
    @ .Lschedule_mangle_last inverts the original transform). It only XORs
    @ 0x63 and applies ShiftRows. The latter was already inverted in the
    @ loop. Note that, because we act on the original representation, we use
    @ q11, not q10.
    veor q0, q0, q11
    vrev32.8 q0, q0
    vst1.64 {q0}, [r0]

    @ Wipe registers which contained key material.
    veor q0, q0, q0
    veor q1, q1, q1
    veor q2, q2, q2

    ldmia sp!, {r11, pc} @ return
.size GFp_vpaes_encrypt_key_to_bsaes,.-GFp_vpaes_encrypt_key_to_bsaes

@ CTR-mode encryption. Register usage visible below:
@   r0 = input pointer, r1 = output pointer, r2 = block count,
@   r3 = AES_KEY pointer, fifth argument (IV/counter block pointer) is
@   read from the caller's stack via ip.
@ The counter word is the last 32 bits of the IV block, big-endian.
.globl GFp_vpaes_ctr32_encrypt_blocks
.hidden GFp_vpaes_ctr32_encrypt_blocks
.type GFp_vpaes_ctr32_encrypt_blocks,%function
.align 4
GFp_vpaes_ctr32_encrypt_blocks:
    mov ip, sp
    stmdb sp!, {r7,r8,r9,r10,r11, lr}
    @ This function uses q4-q7 (d8-d15), which are callee-saved.
    vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}

    cmp r2, #0
    @ r8 is passed on the stack.
    ldr r8, [ip]
    beq .Lctr32_done

    @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
    mov r9, r3
    mov r3, r2
    mov r2, r9

    @ Load the IV and counter portion.
    ldr r7, [r8, #12]
    vld1.8 {q7}, [r8]

    bl _vpaes_preheat
    rev r7, r7 @ The counter is big-endian.

.Lctr32_loop:
    vmov q0, q7
    vld1.8 {q6}, [r0]! @ .Load input ahead of time
    bl _vpaes_encrypt_core
    veor q0, q0, q6 @ XOR input and result
    vst1.8 {q0}, [r1]!
    subs r3, r3, #1
    @ Update the counter. Only the last 32-bit lane of q7 (d15[1]) is
    @ rewritten; the rest of the IV block is preserved across iterations.
    add r7, r7, #1
    rev r9, r7
    vmov.32 d15[1], r9
    bne .Lctr32_loop

.Lctr32_done:
    vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
    ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits