1#! /usr/bin/env perl 2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for ARMv8 AES instructions. The 18# module is endian-agnostic in sense that it supports both big- and 19# little-endian cases. As does it support both 32- and 64-bit modes 20# of operation. Latter is achieved by limiting amount of utilized 21# registers to 16, which implies additional NEON load and integer 22# instructions. This has no effect on mighty Apple A7, where results 23# are literally equal to the theoretical estimates based on AES 24# instruction latencies and issue rates. On Cortex-A53, an in-order 25# execution core, this costs up to 10-15%, which is partially 26# compensated by implementing dedicated code path for 128-bit 27# CBC encrypt case. On Cortex-A57 parallelizable mode performance 28# seems to be limited by sheer amount of NEON instructions... 29# 30# Performance in cycles per byte processed with 128-bit key: 31# 32# CBC enc CBC dec CTR 33# Apple A7 2.39 1.20 1.20 34# Cortex-A53 1.32 1.29 1.46 35# Cortex-A57(*) 1.95 0.85 0.93 36# Denver 1.96 0.86 0.80 37# Mongoose 1.33 1.20 1.20 38# 39# (*) original 3.64/1.34/1.32 results were for r0p0 revision 40# and are still same even for updated module; 41 42$flavour = shift; 43$output = shift; 44 45$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 46( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 47( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or 48die "can't locate arm-xlate.pl"; 49 50open OUT,"| \"$^X\" $xlate $flavour $output"; 51*STDOUT=*OUT; 52 53$prefix="aes_hw"; 54 55$code=<<___; 56#include <openssl/arm_arch.h> 57 58#if __ARM_MAX_ARCH__>=7 59.text 60___ 61$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); 62$code.=<<___ if ($flavour !~ /64/); 63.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) 64.fpu neon 65.code 32 66#undef __thumb2__ 67___ 68 69# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, 70# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to 71# maintain both 32- and 64-bit codes within single module and 72# transliterate common code to either flavour with regex vodoo. 73# 74{{{ 75my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); 76my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= 77 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); 78 79 80# On AArch64, put the data .rodata and use adrp + add for compatibility with 81# execute-only memory. On AArch32, put it in .text and use adr. 82$code.= ".section .rodata\n" if ($flavour =~ /64/); 83$code.=<<___; 84.align 5 85.Lrcon: 86.long 0x01,0x01,0x01,0x01 87.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 88.long 0x1b,0x1b,0x1b,0x1b 89 90.text 91 92.globl ${prefix}_set_encrypt_key 93.type ${prefix}_set_encrypt_key,%function 94.align 5 95${prefix}_set_encrypt_key: 96.Lenc_key: 97___ 98$code.=<<___ if ($flavour =~ /64/); 99 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 100 AARCH64_VALID_CALL_TARGET 101 stp x29,x30,[sp,#-16]! 102 add x29,sp,#0 103___ 104$code.=<<___; 105 mov $ptr,#-1 106 cmp $inp,#0 107 b.eq .Lenc_key_abort 108 cmp $out,#0 109 b.eq .Lenc_key_abort 110 mov $ptr,#-2 111 cmp $bits,#128 112 b.lt .Lenc_key_abort 113 cmp $bits,#256 114 b.gt .Lenc_key_abort 115 tst $bits,#0x3f 116 b.ne .Lenc_key_abort 117 118___ 119$code.=<<___ if ($flavour =~ /64/); 120 adrp $ptr,:pg_hi21:.Lrcon 121 add $ptr,$ptr,:lo12:.Lrcon 122___ 123$code.=<<___ if ($flavour !~ /64/); 124 adr $ptr,.Lrcon 125___ 126$code.=<<___; 127 cmp $bits,#192 128 129 veor $zero,$zero,$zero 130 vld1.8 {$in0},[$inp],#16 131 mov $bits,#8 // reuse $bits 132 vld1.32 {$rcon,$mask},[$ptr],#32 133 134 b.lt .Loop128 135 b.eq .L192 136 b .L256 137 138.align 4 139.Loop128: 140 vtbl.8 $key,{$in0},$mask 141 vext.8 $tmp,$zero,$in0,#12 142 vst1.32 {$in0},[$out],#16 143 aese $key,$zero 144 subs $bits,$bits,#1 145 146 veor $in0,$in0,$tmp 147 vext.8 $tmp,$zero,$tmp,#12 148 veor $in0,$in0,$tmp 149 vext.8 $tmp,$zero,$tmp,#12 150 veor $key,$key,$rcon 151 veor $in0,$in0,$tmp 152 vshl.u8 $rcon,$rcon,#1 153 veor $in0,$in0,$key 154 b.ne .Loop128 155 156 vld1.32 {$rcon},[$ptr] 157 158 vtbl.8 $key,{$in0},$mask 159 vext.8 $tmp,$zero,$in0,#12 160 vst1.32 {$in0},[$out],#16 161 aese $key,$zero 162 163 veor $in0,$in0,$tmp 164 vext.8 $tmp,$zero,$tmp,#12 165 veor $in0,$in0,$tmp 166 vext.8 $tmp,$zero,$tmp,#12 167 veor $key,$key,$rcon 168 veor $in0,$in0,$tmp 169 vshl.u8 $rcon,$rcon,#1 170 veor $in0,$in0,$key 171 172 vtbl.8 $key,{$in0},$mask 173 vext.8 $tmp,$zero,$in0,#12 174 vst1.32 {$in0},[$out],#16 175 aese $key,$zero 176 177 veor $in0,$in0,$tmp 178 vext.8 $tmp,$zero,$tmp,#12 179 veor $in0,$in0,$tmp 180 vext.8 $tmp,$zero,$tmp,#12 181 veor $key,$key,$rcon 182 veor $in0,$in0,$tmp 183 veor $in0,$in0,$key 184 vst1.32 {$in0},[$out] 185 add $out,$out,#0x50 186 187 mov $rounds,#10 188 b .Ldone 189 190.align 4 191.L192: 192 vld1.8 {$in1},[$inp],#8 193 vmov.i8 $key,#8 // borrow $key 194 vst1.32 {$in0},[$out],#16 195 vsub.i8 $mask,$mask,$key // adjust the mask 196 197.Loop192: 198 vtbl.8 $key,{$in1},$mask 199 vext.8 $tmp,$zero,$in0,#12 200 vst1.32 {$in1},[$out],#8 201 aese $key,$zero 202 subs $bits,$bits,#1 203 204 veor $in0,$in0,$tmp 205 vext.8 $tmp,$zero,$tmp,#12 206 veor $in0,$in0,$tmp 207 vext.8 $tmp,$zero,$tmp,#12 208 veor $in0,$in0,$tmp 209 210 vdup.32 $tmp,${in0}[3] 211 veor $tmp,$tmp,$in1 212 veor $key,$key,$rcon 213 vext.8 $in1,$zero,$in1,#12 214 vshl.u8 $rcon,$rcon,#1 215 veor $in1,$in1,$tmp 216 veor $in0,$in0,$key 217 veor $in1,$in1,$key 218 vst1.32 {$in0},[$out],#16 219 b.ne .Loop192 220 221 mov $rounds,#12 222 add $out,$out,#0x20 223 b .Ldone 224 225.align 4 226.L256: 227 vld1.8 {$in1},[$inp] 228 mov $bits,#7 229 mov $rounds,#14 230 vst1.32 {$in0},[$out],#16 231 232.Loop256: 233 vtbl.8 $key,{$in1},$mask 234 vext.8 $tmp,$zero,$in0,#12 235 vst1.32 {$in1},[$out],#16 236 aese $key,$zero 237 subs $bits,$bits,#1 238 239 veor $in0,$in0,$tmp 240 vext.8 $tmp,$zero,$tmp,#12 241 veor $in0,$in0,$tmp 242 vext.8 $tmp,$zero,$tmp,#12 243 veor $key,$key,$rcon 244 veor $in0,$in0,$tmp 245 vshl.u8 $rcon,$rcon,#1 246 veor $in0,$in0,$key 247 vst1.32 {$in0},[$out],#16 248 b.eq .Ldone 249 250 vdup.32 $key,${in0}[3] // just splat 251 vext.8 $tmp,$zero,$in1,#12 252 aese $key,$zero 253 254 veor $in1,$in1,$tmp 255 vext.8 $tmp,$zero,$tmp,#12 256 veor $in1,$in1,$tmp 257 vext.8 $tmp,$zero,$tmp,#12 258 veor $in1,$in1,$tmp 259 260 veor $in1,$in1,$key 261 b .Loop256 262 263.Ldone: 264 str $rounds,[$out] 265 mov $ptr,#0 266 267.Lenc_key_abort: 268 mov x0,$ptr // return value 269 `"ldr x29,[sp],#16" if ($flavour =~ /64/)` 270 ret 271.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key 272 273.globl ${prefix}_set_decrypt_key 274.type ${prefix}_set_decrypt_key,%function 275.align 5 276${prefix}_set_decrypt_key: 277___ 278$code.=<<___ if ($flavour =~ /64/); 279 AARCH64_SIGN_LINK_REGISTER 280 stp x29,x30,[sp,#-16]! 281 add x29,sp,#0 282___ 283$code.=<<___ if ($flavour !~ /64/); 284 stmdb sp!,{r4,lr} 285___ 286$code.=<<___; 287 bl .Lenc_key 288 289 cmp x0,#0 290 b.ne .Ldec_key_abort 291 292 sub $out,$out,#240 // restore original $out 293 mov x4,#-16 294 add $inp,$out,x12,lsl#4 // end of key schedule 295 296 vld1.32 {v0.16b},[$out] 297 vld1.32 {v1.16b},[$inp] 298 vst1.32 {v0.16b},[$inp],x4 299 vst1.32 {v1.16b},[$out],#16 300 301.Loop_imc: 302 vld1.32 {v0.16b},[$out] 303 vld1.32 {v1.16b},[$inp] 304 aesimc v0.16b,v0.16b 305 aesimc v1.16b,v1.16b 306 vst1.32 {v0.16b},[$inp],x4 307 vst1.32 {v1.16b},[$out],#16 308 cmp $inp,$out 309 b.hi .Loop_imc 310 311 vld1.32 {v0.16b},[$out] 312 aesimc v0.16b,v0.16b 313 vst1.32 {v0.16b},[$inp] 314 315 eor x0,x0,x0 // return value 316.Ldec_key_abort: 317___ 318$code.=<<___ if ($flavour !~ /64/); 319 ldmia sp!,{r4,pc} 320___ 321$code.=<<___ if ($flavour =~ /64/); 322 ldp x29,x30,[sp],#16 323 AARCH64_VALIDATE_LINK_REGISTER 324 ret 325___ 326$code.=<<___; 327.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key 328___ 329}}} 330{{{ 331sub gen_block () { 332my $dir = shift; 333my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); 334my ($inp,$out,$key)=map("x$_",(0..2)); 335my $rounds="w3"; 336my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); 337 338$code.=<<___; 339.globl ${prefix}_${dir}crypt 340.type ${prefix}_${dir}crypt,%function 341.align 5 342${prefix}_${dir}crypt: 343 AARCH64_VALID_CALL_TARGET 344 ldr $rounds,[$key,#240] 345 vld1.32 {$rndkey0},[$key],#16 346 vld1.8 {$inout},[$inp] 347 sub $rounds,$rounds,#2 348 vld1.32 {$rndkey1},[$key],#16 349 350.Loop_${dir}c: 351 aes$e $inout,$rndkey0 352 aes$mc $inout,$inout 353 vld1.32 {$rndkey0},[$key],#16 354 subs $rounds,$rounds,#2 355 aes$e $inout,$rndkey1 356 aes$mc $inout,$inout 357 vld1.32 {$rndkey1},[$key],#16 358 b.gt .Loop_${dir}c 359 360 aes$e $inout,$rndkey0 361 aes$mc $inout,$inout 362 vld1.32 {$rndkey0},[$key] 363 aes$e $inout,$rndkey1 364 veor $inout,$inout,$rndkey0 365 366 vst1.8 {$inout},[$out] 367 ret 368.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt 369___ 370} 371&gen_block("en"); 372&gen_block("de"); 373}}} 374{{{ 375my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; 376my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); 377my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 378 379my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 380my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key); 381 382### q8-q15 preloaded key schedule 383 384$code.=<<___; 385.globl ${prefix}_cbc_encrypt 386.type ${prefix}_cbc_encrypt,%function 387.align 5 388${prefix}_cbc_encrypt: 389___ 390$code.=<<___ if ($flavour =~ /64/); 391 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 392 AARCH64_VALID_CALL_TARGET 393 stp x29,x30,[sp,#-16]! 394 add x29,sp,#0 395___ 396$code.=<<___ if ($flavour !~ /64/); 397 mov ip,sp 398 stmdb sp!,{r4-r8,lr} 399 vstmdb sp!,{d8-d15} @ ABI specification says so 400 ldmia ip,{r4-r5} @ load remaining args 401___ 402$code.=<<___; 403 subs $len,$len,#16 404 mov $step,#16 405 b.lo .Lcbc_abort 406 cclr $step,eq 407 408 cmp $enc,#0 // en- or decrypting? 409 ldr $rounds,[$key,#240] 410 and $len,$len,#-16 411 vld1.8 {$ivec},[$ivp] 412 vld1.8 {$dat},[$inp],$step 413 414 vld1.32 {q8-q9},[$key] // load key schedule... 415 sub $rounds,$rounds,#6 416 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys 417 sub $rounds,$rounds,#2 418 vld1.32 {q10-q11},[$key_],#32 419 vld1.32 {q12-q13},[$key_],#32 420 vld1.32 {q14-q15},[$key_],#32 421 vld1.32 {$rndlast},[$key_] 422 423 add $key_,$key,#32 424 mov $cnt,$rounds 425 b.eq .Lcbc_dec 426 427 cmp $rounds,#2 428 veor $dat,$dat,$ivec 429 veor $rndzero_n_last,q8,$rndlast 430 b.eq .Lcbc_enc128 431 432 vld1.32 {$in0-$in1},[$key_] 433 add $key_,$key,#16 434 add $key4,$key,#16*4 435 add $key5,$key,#16*5 436 aese $dat,q8 437 aesmc $dat,$dat 438 add $key6,$key,#16*6 439 add $key7,$key,#16*7 440 b .Lenter_cbc_enc 441 442.align 4 443.Loop_cbc_enc: 444 aese $dat,q8 445 aesmc $dat,$dat 446 vst1.8 {$ivec},[$out],#16 447.Lenter_cbc_enc: 448 aese $dat,q9 449 aesmc $dat,$dat 450 aese $dat,$in0 451 aesmc $dat,$dat 452 vld1.32 {q8},[$key4] 453 cmp $rounds,#4 454 aese $dat,$in1 455 aesmc $dat,$dat 456 vld1.32 {q9},[$key5] 457 b.eq .Lcbc_enc192 458 459 aese $dat,q8 460 aesmc $dat,$dat 461 vld1.32 {q8},[$key6] 462 aese $dat,q9 463 aesmc $dat,$dat 464 vld1.32 {q9},[$key7] 465 nop 466 467.Lcbc_enc192: 468 aese $dat,q8 469 aesmc $dat,$dat 470 subs $len,$len,#16 471 aese $dat,q9 472 aesmc $dat,$dat 473 cclr $step,eq 474 aese $dat,q10 475 aesmc $dat,$dat 476 aese $dat,q11 477 aesmc $dat,$dat 478 vld1.8 {q8},[$inp],$step 479 aese $dat,q12 480 aesmc $dat,$dat 481 veor q8,q8,$rndzero_n_last 482 aese $dat,q13 483 aesmc $dat,$dat 484 vld1.32 {q9},[$key_] // re-pre-load rndkey[1] 485 aese $dat,q14 486 aesmc $dat,$dat 487 aese $dat,q15 488 veor $ivec,$dat,$rndlast 489 b.hs .Loop_cbc_enc 490 491 vst1.8 {$ivec},[$out],#16 492 b .Lcbc_done 493 494.align 5 495.Lcbc_enc128: 496 vld1.32 {$in0-$in1},[$key_] 497 aese $dat,q8 498 aesmc $dat,$dat 499 b .Lenter_cbc_enc128 500.Loop_cbc_enc128: 501 aese $dat,q8 502 aesmc $dat,$dat 503 vst1.8 {$ivec},[$out],#16 504.Lenter_cbc_enc128: 505 aese $dat,q9 506 aesmc $dat,$dat 507 subs $len,$len,#16 508 aese $dat,$in0 509 aesmc $dat,$dat 510 cclr $step,eq 511 aese $dat,$in1 512 aesmc $dat,$dat 513 aese $dat,q10 514 aesmc $dat,$dat 515 aese $dat,q11 516 aesmc $dat,$dat 517 vld1.8 {q8},[$inp],$step 518 aese $dat,q12 519 aesmc $dat,$dat 520 aese $dat,q13 521 aesmc $dat,$dat 522 aese $dat,q14 523 aesmc $dat,$dat 524 veor q8,q8,$rndzero_n_last 525 aese $dat,q15 526 veor $ivec,$dat,$rndlast 527 b.hs .Loop_cbc_enc128 528 529 vst1.8 {$ivec},[$out],#16 530 b .Lcbc_done 531___ 532{ 533my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 534$code.=<<___; 535.align 5 536.Lcbc_dec: 537 vld1.8 {$dat2},[$inp],#16 538 subs $len,$len,#32 // bias 539 add $cnt,$rounds,#2 540 vorr $in1,$dat,$dat 541 vorr $dat1,$dat,$dat 542 vorr $in2,$dat2,$dat2 543 b.lo .Lcbc_dec_tail 544 545 vorr $dat1,$dat2,$dat2 546 vld1.8 {$dat2},[$inp],#16 547 vorr $in0,$dat,$dat 548 vorr $in1,$dat1,$dat1 549 vorr $in2,$dat2,$dat2 550 551.Loop3x_cbc_dec: 552 aesd $dat0,q8 553 aesimc $dat0,$dat0 554 aesd $dat1,q8 555 aesimc $dat1,$dat1 556 aesd $dat2,q8 557 aesimc $dat2,$dat2 558 vld1.32 {q8},[$key_],#16 559 subs $cnt,$cnt,#2 560 aesd $dat0,q9 561 aesimc $dat0,$dat0 562 aesd $dat1,q9 563 aesimc $dat1,$dat1 564 aesd $dat2,q9 565 aesimc $dat2,$dat2 566 vld1.32 {q9},[$key_],#16 567 b.gt .Loop3x_cbc_dec 568 569 aesd $dat0,q8 570 aesimc $dat0,$dat0 571 aesd $dat1,q8 572 aesimc $dat1,$dat1 573 aesd $dat2,q8 574 aesimc $dat2,$dat2 575 veor $tmp0,$ivec,$rndlast 576 subs $len,$len,#0x30 577 veor $tmp1,$in0,$rndlast 578 mov.lo x6,$len // x6, $cnt, is zero at this point 579 aesd $dat0,q9 580 aesimc $dat0,$dat0 581 aesd $dat1,q9 582 aesimc $dat1,$dat1 583 aesd $dat2,q9 584 aesimc $dat2,$dat2 585 veor $tmp2,$in1,$rndlast 586 add $inp,$inp,x6 // $inp is adjusted in such way that 587 // at exit from the loop $dat1-$dat2 588 // are loaded with last "words" 589 vorr $ivec,$in2,$in2 590 mov $key_,$key 591 aesd $dat0,q12 592 aesimc $dat0,$dat0 593 aesd $dat1,q12 594 aesimc $dat1,$dat1 595 aesd $dat2,q12 596 aesimc $dat2,$dat2 597 vld1.8 {$in0},[$inp],#16 598 aesd $dat0,q13 599 aesimc $dat0,$dat0 600 aesd $dat1,q13 601 aesimc $dat1,$dat1 602 aesd $dat2,q13 603 aesimc $dat2,$dat2 604 vld1.8 {$in1},[$inp],#16 605 aesd $dat0,q14 606 aesimc $dat0,$dat0 607 aesd $dat1,q14 608 aesimc $dat1,$dat1 609 aesd $dat2,q14 610 aesimc $dat2,$dat2 611 vld1.8 {$in2},[$inp],#16 612 aesd $dat0,q15 613 aesd $dat1,q15 614 aesd $dat2,q15 615 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 616 add $cnt,$rounds,#2 617 veor $tmp0,$tmp0,$dat0 618 veor $tmp1,$tmp1,$dat1 619 veor $dat2,$dat2,$tmp2 620 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 621 vst1.8 {$tmp0},[$out],#16 622 vorr $dat0,$in0,$in0 623 vst1.8 {$tmp1},[$out],#16 624 vorr $dat1,$in1,$in1 625 vst1.8 {$dat2},[$out],#16 626 vorr $dat2,$in2,$in2 627 b.hs .Loop3x_cbc_dec 628 629 cmn $len,#0x30 630 b.eq .Lcbc_done 631 nop 632 633.Lcbc_dec_tail: 634 aesd $dat1,q8 635 aesimc $dat1,$dat1 636 aesd $dat2,q8 637 aesimc $dat2,$dat2 638 vld1.32 {q8},[$key_],#16 639 subs $cnt,$cnt,#2 640 aesd $dat1,q9 641 aesimc $dat1,$dat1 642 aesd $dat2,q9 643 aesimc $dat2,$dat2 644 vld1.32 {q9},[$key_],#16 645 b.gt .Lcbc_dec_tail 646 647 aesd $dat1,q8 648 aesimc $dat1,$dat1 649 aesd $dat2,q8 650 aesimc $dat2,$dat2 651 aesd $dat1,q9 652 aesimc $dat1,$dat1 653 aesd $dat2,q9 654 aesimc $dat2,$dat2 655 aesd $dat1,q12 656 aesimc $dat1,$dat1 657 aesd $dat2,q12 658 aesimc $dat2,$dat2 659 cmn $len,#0x20 660 aesd $dat1,q13 661 aesimc $dat1,$dat1 662 aesd $dat2,q13 663 aesimc $dat2,$dat2 664 veor $tmp1,$ivec,$rndlast 665 aesd $dat1,q14 666 aesimc $dat1,$dat1 667 aesd $dat2,q14 668 aesimc $dat2,$dat2 669 veor $tmp2,$in1,$rndlast 670 aesd $dat1,q15 671 aesd $dat2,q15 672 b.eq .Lcbc_dec_one 673 veor $tmp1,$tmp1,$dat1 674 veor $tmp2,$tmp2,$dat2 675 vorr $ivec,$in2,$in2 676 vst1.8 {$tmp1},[$out],#16 677 vst1.8 {$tmp2},[$out],#16 678 b .Lcbc_done 679 680.Lcbc_dec_one: 681 veor $tmp1,$tmp1,$dat2 682 vorr $ivec,$in2,$in2 683 vst1.8 {$tmp1},[$out],#16 684 685.Lcbc_done: 686 vst1.8 {$ivec},[$ivp] 687.Lcbc_abort: 688___ 689} 690$code.=<<___ if ($flavour !~ /64/); 691 vldmia sp!,{d8-d15} 692 ldmia sp!,{r4-r8,pc} 693___ 694$code.=<<___ if ($flavour =~ /64/); 695 ldr x29,[sp],#16 696 ret 697___ 698$code.=<<___; 699.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt 700___ 701}}} 702{{{ 703my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); 704my ($rounds,$cnt,$key_)=("w5","w6","x7"); 705my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); 706my $step="x12"; # aliases with $tctr2 707 708my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 709my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 710 711my ($dat,$tmp)=($dat0,$tmp0); 712 713### q8-q15 preloaded key schedule 714 715$code.=<<___; 716.globl ${prefix}_ctr32_encrypt_blocks 717.type ${prefix}_ctr32_encrypt_blocks,%function 718.align 5 719${prefix}_ctr32_encrypt_blocks: 720___ 721$code.=<<___ if ($flavour =~ /64/); 722 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 723 AARCH64_VALID_CALL_TARGET 724 stp x29,x30,[sp,#-16]! 725 add x29,sp,#0 726___ 727$code.=<<___ if ($flavour !~ /64/); 728 mov ip,sp 729 stmdb sp!,{r4-r10,lr} 730 vstmdb sp!,{d8-d15} @ ABI specification says so 731 ldr r4, [ip] @ load remaining arg 732___ 733$code.=<<___; 734 ldr $rounds,[$key,#240] 735 736 ldr $ctr, [$ivp, #12] 737 vld1.32 {$dat0},[$ivp] 738 739 vld1.32 {q8-q9},[$key] // load key schedule... 740 sub $rounds,$rounds,#4 741 mov $step,#16 742 cmp $len,#2 743 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys 744 sub $rounds,$rounds,#2 745 vld1.32 {q12-q13},[$key_],#32 746 vld1.32 {q14-q15},[$key_],#32 747 vld1.32 {$rndlast},[$key_] 748 add $key_,$key,#32 749 mov $cnt,$rounds 750 cclr $step,lo 751 752 // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are 753 // affected by silicon errata #1742098 [0] and #1655431 [1], 754 // respectively, where the second instruction of an aese/aesmc 755 // instruction pair may execute twice if an interrupt is taken right 756 // after the first instruction consumes an input register of which a 757 // single 32-bit lane has been updated the last time it was modified. 758 // 759 // This function uses a counter in one 32-bit lane. The vmov.32 lines 760 // could write to $dat1 and $dat2 directly, but that trips this bugs. 761 // We write to $ivec and copy to the final register as a workaround. 762 // 763 // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice 764 // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice 765#ifndef __ARMEB__ 766 rev $ctr, $ctr 767#endif 768 add $tctr1, $ctr, #1 769 vorr $ivec,$dat0,$dat0 770 rev $tctr1, $tctr1 771 vmov.32 ${ivec}[3],$tctr1 772 add $ctr, $ctr, #2 773 vorr $dat1,$ivec,$ivec 774 b.ls .Lctr32_tail 775 rev $tctr2, $ctr 776 vmov.32 ${ivec}[3],$tctr2 777 sub $len,$len,#3 // bias 778 vorr $dat2,$ivec,$ivec 779 b .Loop3x_ctr32 780 781.align 4 782.Loop3x_ctr32: 783 aese $dat0,q8 784 aesmc $dat0,$dat0 785 aese $dat1,q8 786 aesmc $dat1,$dat1 787 aese $dat2,q8 788 aesmc $dat2,$dat2 789 vld1.32 {q8},[$key_],#16 790 subs $cnt,$cnt,#2 791 aese $dat0,q9 792 aesmc $dat0,$dat0 793 aese $dat1,q9 794 aesmc $dat1,$dat1 795 aese $dat2,q9 796 aesmc $dat2,$dat2 797 vld1.32 {q9},[$key_],#16 798 b.gt .Loop3x_ctr32 799 800 aese $dat0,q8 801 aesmc $tmp0,$dat0 802 aese $dat1,q8 803 aesmc $tmp1,$dat1 804 vld1.8 {$in0},[$inp],#16 805 add $tctr0,$ctr,#1 806 aese $dat2,q8 807 aesmc $dat2,$dat2 808 vld1.8 {$in1},[$inp],#16 809 rev $tctr0,$tctr0 810 aese $tmp0,q9 811 aesmc $tmp0,$tmp0 812 aese $tmp1,q9 813 aesmc $tmp1,$tmp1 814 vld1.8 {$in2},[$inp],#16 815 mov $key_,$key 816 aese $dat2,q9 817 aesmc $tmp2,$dat2 818 aese $tmp0,q12 819 aesmc $tmp0,$tmp0 820 aese $tmp1,q12 821 aesmc $tmp1,$tmp1 822 veor $in0,$in0,$rndlast 823 add $tctr1,$ctr,#2 824 aese $tmp2,q12 825 aesmc $tmp2,$tmp2 826 veor $in1,$in1,$rndlast 827 add $ctr,$ctr,#3 828 aese $tmp0,q13 829 aesmc $tmp0,$tmp0 830 aese $tmp1,q13 831 aesmc $tmp1,$tmp1 832 // Note the logic to update $dat0, $dat1, and $dat1 is written to work 833 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in 834 // 32-bit mode. See the comment above. 835 veor $in2,$in2,$rndlast 836 vmov.32 ${ivec}[3], $tctr0 837 aese $tmp2,q13 838 aesmc $tmp2,$tmp2 839 vorr $dat0,$ivec,$ivec 840 rev $tctr1,$tctr1 841 aese $tmp0,q14 842 aesmc $tmp0,$tmp0 843 vmov.32 ${ivec}[3], $tctr1 844 rev $tctr2,$ctr 845 aese $tmp1,q14 846 aesmc $tmp1,$tmp1 847 vorr $dat1,$ivec,$ivec 848 vmov.32 ${ivec}[3], $tctr2 849 aese $tmp2,q14 850 aesmc $tmp2,$tmp2 851 vorr $dat2,$ivec,$ivec 852 subs $len,$len,#3 853 aese $tmp0,q15 854 aese $tmp1,q15 855 aese $tmp2,q15 856 857 veor $in0,$in0,$tmp0 858 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 859 vst1.8 {$in0},[$out],#16 860 veor $in1,$in1,$tmp1 861 mov $cnt,$rounds 862 vst1.8 {$in1},[$out],#16 863 veor $in2,$in2,$tmp2 864 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 865 vst1.8 {$in2},[$out],#16 866 b.hs .Loop3x_ctr32 867 868 adds $len,$len,#3 869 b.eq .Lctr32_done 870 cmp $len,#1 871 mov $step,#16 872 cclr $step,eq 873 874.Lctr32_tail: 875 aese $dat0,q8 876 aesmc $dat0,$dat0 877 aese $dat1,q8 878 aesmc $dat1,$dat1 879 vld1.32 {q8},[$key_],#16 880 subs $cnt,$cnt,#2 881 aese $dat0,q9 882 aesmc $dat0,$dat0 883 aese $dat1,q9 884 aesmc $dat1,$dat1 885 vld1.32 {q9},[$key_],#16 886 b.gt .Lctr32_tail 887 888 aese $dat0,q8 889 aesmc $dat0,$dat0 890 aese $dat1,q8 891 aesmc $dat1,$dat1 892 aese $dat0,q9 893 aesmc $dat0,$dat0 894 aese $dat1,q9 895 aesmc $dat1,$dat1 896 vld1.8 {$in0},[$inp],$step 897 aese $dat0,q12 898 aesmc $dat0,$dat0 899 aese $dat1,q12 900 aesmc $dat1,$dat1 901 vld1.8 {$in1},[$inp] 902 aese $dat0,q13 903 aesmc $dat0,$dat0 904 aese $dat1,q13 905 aesmc $dat1,$dat1 906 veor $in0,$in0,$rndlast 907 aese $dat0,q14 908 aesmc $dat0,$dat0 909 aese $dat1,q14 910 aesmc $dat1,$dat1 911 veor $in1,$in1,$rndlast 912 aese $dat0,q15 913 aese $dat1,q15 914 915 cmp $len,#1 916 veor $in0,$in0,$dat0 917 veor $in1,$in1,$dat1 918 vst1.8 {$in0},[$out],#16 919 b.eq .Lctr32_done 920 vst1.8 {$in1},[$out] 921 922.Lctr32_done: 923___ 924$code.=<<___ if ($flavour !~ /64/); 925 vldmia sp!,{d8-d15} 926 ldmia sp!,{r4-r10,pc} 927___ 928$code.=<<___ if ($flavour =~ /64/); 929 ldr x29,[sp],#16 930 ret 931___ 932$code.=<<___; 933.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks 934___ 935}}} 936$code.=<<___; 937#endif 938___ 939######################################## 940if ($flavour =~ /64/) { ######## 64-bit code 941 my %opcode = ( 942 "aesd" => 0x4e285800, "aese" => 0x4e284800, 943 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); 944 945 local *unaes = sub { 946 my ($mnemonic,$arg)=@_; 947 948 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && 949 sprintf ".inst\t0x%08x\t//%s %s", 950 $opcode{$mnemonic}|$1|($2<<5), 951 $mnemonic,$arg; 952 }; 953 954 foreach(split("\n",$code)) { 955 s/\`([^\`]*)\`/eval($1)/geo; 956 957 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers 958 s/@\s/\/\//o; # old->new style commentary 959 960 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 961 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or 962 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or 963 s/vmov\.i8/movi/o or # fix up legacy mnemonics 964 s/vext\.8/ext/o or 965 s/vrev32\.8/rev32/o or 966 s/vtst\.8/cmtst/o or 967 s/vshr/ushr/o or 968 s/^(\s+)v/$1/o or # strip off v prefix 969 s/\bbx\s+lr\b/ret/o; 970 971 # fix up remaining legacy suffixes 972 s/\.[ui]?8//o; 973 m/\],#8/o and s/\.16b/\.8b/go; 974 s/\.[ui]?32//o and s/\.16b/\.4s/go; 975 s/\.[ui]?64//o and s/\.16b/\.2d/go; 976 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; 977 978 print $_,"\n"; 979 } 980} else { ######## 32-bit code 981 my %opcode = ( 982 "aesd" => 0xf3b00340, "aese" => 0xf3b00300, 983 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); 984 985 local *unaes = sub { 986 my ($mnemonic,$arg)=@_; 987 988 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { 989 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) 990 |(($2&7)<<1) |(($2&8)<<2); 991 # since ARMv7 instructions are always encoded little-endian. 992 # correct solution is to use .inst directive, but older 993 # assemblers don't implement it:-( 994 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", 995 $word&0xff,($word>>8)&0xff, 996 ($word>>16)&0xff,($word>>24)&0xff, 997 $mnemonic,$arg; 998 } 999 }; 1000 1001 sub unvtbl { 1002 my $arg=shift; 1003 1004 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && 1005 sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 1006 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 1007 } 1008 1009 sub unvdup32 { 1010 my $arg=shift; 1011 1012 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && 1013 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; 1014 } 1015 1016 sub unvmov32 { 1017 my $arg=shift; 1018 1019 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && 1020 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; 1021 } 1022 1023 foreach(split("\n",$code)) { 1024 s/\`([^\`]*)\`/eval($1)/geo; 1025 1026 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers 1027 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers 1028 s/\/\/\s?/@ /o; # new->old style commentary 1029 1030 # fix up remaining new-style suffixes 1031 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or 1032 s/\],#[0-9]+/]!/o; 1033 1034 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 1035 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or 1036 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or 1037 s/vdup\.32\s+(.*)/unvdup32($1)/geo or 1038 s/vmov\.32\s+(.*)/unvmov32($1)/geo or 1039 s/^(\s+)b\./$1b/o or 1040 s/^(\s+)mov\./$1mov/o or 1041 s/^(\s+)ret/$1bx\tlr/o; 1042 1043 print $_,"\n"; 1044 } 1045} 1046 1047close STDOUT or die "error closing STDOUT"; 1048