1#! /usr/bin/env perl 2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for ARMv8 AES instructions. The 18# module is endian-agnostic in sense that it supports both big- and 19# little-endian cases. As does it support both 32- and 64-bit modes 20# of operation. Latter is achieved by limiting amount of utilized 21# registers to 16, which implies additional NEON load and integer 22# instructions. This has no effect on mighty Apple A7, where results 23# are literally equal to the theoretical estimates based on AES 24# instruction latencies and issue rates. On Cortex-A53, an in-order 25# execution core, this costs up to 10-15%, which is partially 26# compensated by implementing dedicated code path for 128-bit 27# CBC encrypt case. On Cortex-A57 parallelizable mode performance 28# seems to be limited by sheer amount of NEON instructions... 29# 30# Performance in cycles per byte processed with 128-bit key: 31# 32# CBC enc CBC dec CTR 33# Apple A7 2.39 1.20 1.20 34# Cortex-A53 1.32 1.29 1.46 35# Cortex-A57(*) 1.95 0.85 0.93 36# Denver 1.96 0.86 0.80 37# Mongoose 1.33 1.20 1.20 38# 39# (*) original 3.64/1.34/1.32 results were for r0p0 revision 40# and are still same even for updated module; 41 42$flavour = shift; 43$output = shift; 44 45$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 46( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 47( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or 48die "can't locate arm-xlate.pl"; 49 50open OUT,"| \"$^X\" $xlate $flavour $output"; 51*STDOUT=*OUT; 52 53$prefix="aes_hw"; 54 55$code=<<___; 56#include <openssl/arm_arch.h> 57 58#if __ARM_MAX_ARCH__>=7 59.text 60___ 61$code.=<<___ if ($flavour =~ /64/); 62#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH) 63.arch armv8-a+crypto 64#endif 65___ 66$code.=<<___ if ($flavour !~ /64/); 67.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) 68.fpu neon 69.code 32 70#undef __thumb2__ 71___ 72 73# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, 74# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to 75# maintain both 32- and 64-bit codes within single module and 76# transliterate common code to either flavour with regex vodoo. 77# 78{{{ 79my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); 80my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= 81 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); 82 83 84$code.=<<___; 85.align 5 86.Lrcon: 87.long 0x01,0x01,0x01,0x01 88.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 89.long 0x1b,0x1b,0x1b,0x1b 90 91.globl ${prefix}_set_encrypt_key 92.type ${prefix}_set_encrypt_key,%function 93.align 5 94${prefix}_set_encrypt_key: 95.Lenc_key: 96___ 97$code.=<<___ if ($flavour =~ /64/); 98 stp x29,x30,[sp,#-16]! 99 add x29,sp,#0 100___ 101$code.=<<___; 102 mov $ptr,#-1 103 cmp $inp,#0 104 b.eq .Lenc_key_abort 105 cmp $out,#0 106 b.eq .Lenc_key_abort 107 mov $ptr,#-2 108 cmp $bits,#128 109 b.lt .Lenc_key_abort 110 cmp $bits,#256 111 b.gt .Lenc_key_abort 112 tst $bits,#0x3f 113 b.ne .Lenc_key_abort 114 115 adr $ptr,.Lrcon 116 cmp $bits,#192 117 118 veor $zero,$zero,$zero 119 vld1.8 {$in0},[$inp],#16 120 mov $bits,#8 // reuse $bits 121 vld1.32 {$rcon,$mask},[$ptr],#32 122 123 b.lt .Loop128 124 b.eq .L192 125 b .L256 126 127.align 4 128.Loop128: 129 vtbl.8 $key,{$in0},$mask 130 vext.8 $tmp,$zero,$in0,#12 131 vst1.32 {$in0},[$out],#16 132 aese $key,$zero 133 subs $bits,$bits,#1 134 135 veor $in0,$in0,$tmp 136 vext.8 $tmp,$zero,$tmp,#12 137 veor $in0,$in0,$tmp 138 vext.8 $tmp,$zero,$tmp,#12 139 veor $key,$key,$rcon 140 veor $in0,$in0,$tmp 141 vshl.u8 $rcon,$rcon,#1 142 veor $in0,$in0,$key 143 b.ne .Loop128 144 145 vld1.32 {$rcon},[$ptr] 146 147 vtbl.8 $key,{$in0},$mask 148 vext.8 $tmp,$zero,$in0,#12 149 vst1.32 {$in0},[$out],#16 150 aese $key,$zero 151 152 veor $in0,$in0,$tmp 153 vext.8 $tmp,$zero,$tmp,#12 154 veor $in0,$in0,$tmp 155 vext.8 $tmp,$zero,$tmp,#12 156 veor $key,$key,$rcon 157 veor $in0,$in0,$tmp 158 vshl.u8 $rcon,$rcon,#1 159 veor $in0,$in0,$key 160 161 vtbl.8 $key,{$in0},$mask 162 vext.8 $tmp,$zero,$in0,#12 163 vst1.32 {$in0},[$out],#16 164 aese $key,$zero 165 166 veor $in0,$in0,$tmp 167 vext.8 $tmp,$zero,$tmp,#12 168 veor $in0,$in0,$tmp 169 vext.8 $tmp,$zero,$tmp,#12 170 veor $key,$key,$rcon 171 veor $in0,$in0,$tmp 172 veor $in0,$in0,$key 173 vst1.32 {$in0},[$out] 174 add $out,$out,#0x50 175 176 mov $rounds,#10 177 b .Ldone 178 179.align 4 180.L192: 181 vld1.8 {$in1},[$inp],#8 182 vmov.i8 $key,#8 // borrow $key 183 vst1.32 {$in0},[$out],#16 184 vsub.i8 $mask,$mask,$key // adjust the mask 185 186.Loop192: 187 vtbl.8 $key,{$in1},$mask 188 vext.8 $tmp,$zero,$in0,#12 189 vst1.32 {$in1},[$out],#8 190 aese $key,$zero 191 subs $bits,$bits,#1 192 193 veor $in0,$in0,$tmp 194 vext.8 $tmp,$zero,$tmp,#12 195 veor $in0,$in0,$tmp 196 vext.8 $tmp,$zero,$tmp,#12 197 veor $in0,$in0,$tmp 198 199 vdup.32 $tmp,${in0}[3] 200 veor $tmp,$tmp,$in1 201 veor $key,$key,$rcon 202 vext.8 $in1,$zero,$in1,#12 203 vshl.u8 $rcon,$rcon,#1 204 veor $in1,$in1,$tmp 205 veor $in0,$in0,$key 206 veor $in1,$in1,$key 207 vst1.32 {$in0},[$out],#16 208 b.ne .Loop192 209 210 mov $rounds,#12 211 add $out,$out,#0x20 212 b .Ldone 213 214.align 4 215.L256: 216 vld1.8 {$in1},[$inp] 217 mov $bits,#7 218 mov $rounds,#14 219 vst1.32 {$in0},[$out],#16 220 221.Loop256: 222 vtbl.8 $key,{$in1},$mask 223 vext.8 $tmp,$zero,$in0,#12 224 vst1.32 {$in1},[$out],#16 225 aese $key,$zero 226 subs $bits,$bits,#1 227 228 veor $in0,$in0,$tmp 229 vext.8 $tmp,$zero,$tmp,#12 230 veor $in0,$in0,$tmp 231 vext.8 $tmp,$zero,$tmp,#12 232 veor $key,$key,$rcon 233 veor $in0,$in0,$tmp 234 vshl.u8 $rcon,$rcon,#1 235 veor $in0,$in0,$key 236 vst1.32 {$in0},[$out],#16 237 b.eq .Ldone 238 239 vdup.32 $key,${in0}[3] // just splat 240 vext.8 $tmp,$zero,$in1,#12 241 aese $key,$zero 242 243 veor $in1,$in1,$tmp 244 vext.8 $tmp,$zero,$tmp,#12 245 veor $in1,$in1,$tmp 246 vext.8 $tmp,$zero,$tmp,#12 247 veor $in1,$in1,$tmp 248 249 veor $in1,$in1,$key 250 b .Loop256 251 252.Ldone: 253 str $rounds,[$out] 254 mov $ptr,#0 255 256.Lenc_key_abort: 257 mov x0,$ptr // return value 258 `"ldr x29,[sp],#16" if ($flavour =~ /64/)` 259 ret 260.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key 261 262.globl ${prefix}_set_decrypt_key 263.type ${prefix}_set_decrypt_key,%function 264.align 5 265${prefix}_set_decrypt_key: 266___ 267$code.=<<___ if ($flavour =~ /64/); 268 stp x29,x30,[sp,#-16]! 269 add x29,sp,#0 270___ 271$code.=<<___ if ($flavour !~ /64/); 272 stmdb sp!,{r4,lr} 273___ 274$code.=<<___; 275 bl .Lenc_key 276 277 cmp x0,#0 278 b.ne .Ldec_key_abort 279 280 sub $out,$out,#240 // restore original $out 281 mov x4,#-16 282 add $inp,$out,x12,lsl#4 // end of key schedule 283 284 vld1.32 {v0.16b},[$out] 285 vld1.32 {v1.16b},[$inp] 286 vst1.32 {v0.16b},[$inp],x4 287 vst1.32 {v1.16b},[$out],#16 288 289.Loop_imc: 290 vld1.32 {v0.16b},[$out] 291 vld1.32 {v1.16b},[$inp] 292 aesimc v0.16b,v0.16b 293 aesimc v1.16b,v1.16b 294 vst1.32 {v0.16b},[$inp],x4 295 vst1.32 {v1.16b},[$out],#16 296 cmp $inp,$out 297 b.hi .Loop_imc 298 299 vld1.32 {v0.16b},[$out] 300 aesimc v0.16b,v0.16b 301 vst1.32 {v0.16b},[$inp] 302 303 eor x0,x0,x0 // return value 304.Ldec_key_abort: 305___ 306$code.=<<___ if ($flavour !~ /64/); 307 ldmia sp!,{r4,pc} 308___ 309$code.=<<___ if ($flavour =~ /64/); 310 ldp x29,x30,[sp],#16 311 ret 312___ 313$code.=<<___; 314.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key 315___ 316}}} 317{{{ 318sub gen_block () { 319my $dir = shift; 320my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); 321my ($inp,$out,$key)=map("x$_",(0..2)); 322my $rounds="w3"; 323my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); 324 325$code.=<<___; 326.globl ${prefix}_${dir}crypt 327.type ${prefix}_${dir}crypt,%function 328.align 5 329${prefix}_${dir}crypt: 330 ldr $rounds,[$key,#240] 331 vld1.32 {$rndkey0},[$key],#16 332 vld1.8 {$inout},[$inp] 333 sub $rounds,$rounds,#2 334 vld1.32 {$rndkey1},[$key],#16 335 336.Loop_${dir}c: 337 aes$e $inout,$rndkey0 338 aes$mc $inout,$inout 339 vld1.32 {$rndkey0},[$key],#16 340 subs $rounds,$rounds,#2 341 aes$e $inout,$rndkey1 342 aes$mc $inout,$inout 343 vld1.32 {$rndkey1},[$key],#16 344 b.gt .Loop_${dir}c 345 346 aes$e $inout,$rndkey0 347 aes$mc $inout,$inout 348 vld1.32 {$rndkey0},[$key] 349 aes$e $inout,$rndkey1 350 veor $inout,$inout,$rndkey0 351 352 vst1.8 {$inout},[$out] 353 ret 354.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt 355___ 356} 357&gen_block("en"); 358&gen_block("de"); 359}}} 360{{{ 361my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; 362my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); 363my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 364 365my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); 366my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key); 367 368### q8-q15 preloaded key schedule 369 370$code.=<<___; 371.globl ${prefix}_cbc_encrypt 372.type ${prefix}_cbc_encrypt,%function 373.align 5 374${prefix}_cbc_encrypt: 375___ 376$code.=<<___ if ($flavour =~ /64/); 377 stp x29,x30,[sp,#-16]! 378 add x29,sp,#0 379___ 380$code.=<<___ if ($flavour !~ /64/); 381 mov ip,sp 382 stmdb sp!,{r4-r8,lr} 383 vstmdb sp!,{d8-d15} @ ABI specification says so 384 ldmia ip,{r4-r5} @ load remaining args 385___ 386$code.=<<___; 387 subs $len,$len,#16 388 mov $step,#16 389 b.lo .Lcbc_abort 390 cclr $step,eq 391 392 cmp $enc,#0 // en- or decrypting? 393 ldr $rounds,[$key,#240] 394 and $len,$len,#-16 395 vld1.8 {$ivec},[$ivp] 396 vld1.8 {$dat},[$inp],$step 397 398 vld1.32 {q8-q9},[$key] // load key schedule... 399 sub $rounds,$rounds,#6 400 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys 401 sub $rounds,$rounds,#2 402 vld1.32 {q10-q11},[$key_],#32 403 vld1.32 {q12-q13},[$key_],#32 404 vld1.32 {q14-q15},[$key_],#32 405 vld1.32 {$rndlast},[$key_] 406 407 add $key_,$key,#32 408 mov $cnt,$rounds 409 b.eq .Lcbc_dec 410 411 cmp $rounds,#2 412 veor $dat,$dat,$ivec 413 veor $rndzero_n_last,q8,$rndlast 414 b.eq .Lcbc_enc128 415 416 vld1.32 {$in0-$in1},[$key_] 417 add $key_,$key,#16 418 add $key4,$key,#16*4 419 add $key5,$key,#16*5 420 aese $dat,q8 421 aesmc $dat,$dat 422 add $key6,$key,#16*6 423 add $key7,$key,#16*7 424 b .Lenter_cbc_enc 425 426.align 4 427.Loop_cbc_enc: 428 aese $dat,q8 429 aesmc $dat,$dat 430 vst1.8 {$ivec},[$out],#16 431.Lenter_cbc_enc: 432 aese $dat,q9 433 aesmc $dat,$dat 434 aese $dat,$in0 435 aesmc $dat,$dat 436 vld1.32 {q8},[$key4] 437 cmp $rounds,#4 438 aese $dat,$in1 439 aesmc $dat,$dat 440 vld1.32 {q9},[$key5] 441 b.eq .Lcbc_enc192 442 443 aese $dat,q8 444 aesmc $dat,$dat 445 vld1.32 {q8},[$key6] 446 aese $dat,q9 447 aesmc $dat,$dat 448 vld1.32 {q9},[$key7] 449 nop 450 451.Lcbc_enc192: 452 aese $dat,q8 453 aesmc $dat,$dat 454 subs $len,$len,#16 455 aese $dat,q9 456 aesmc $dat,$dat 457 cclr $step,eq 458 aese $dat,q10 459 aesmc $dat,$dat 460 aese $dat,q11 461 aesmc $dat,$dat 462 vld1.8 {q8},[$inp],$step 463 aese $dat,q12 464 aesmc $dat,$dat 465 veor q8,q8,$rndzero_n_last 466 aese $dat,q13 467 aesmc $dat,$dat 468 vld1.32 {q9},[$key_] // re-pre-load rndkey[1] 469 aese $dat,q14 470 aesmc $dat,$dat 471 aese $dat,q15 472 veor $ivec,$dat,$rndlast 473 b.hs .Loop_cbc_enc 474 475 vst1.8 {$ivec},[$out],#16 476 b .Lcbc_done 477 478.align 5 479.Lcbc_enc128: 480 vld1.32 {$in0-$in1},[$key_] 481 aese $dat,q8 482 aesmc $dat,$dat 483 b .Lenter_cbc_enc128 484.Loop_cbc_enc128: 485 aese $dat,q8 486 aesmc $dat,$dat 487 vst1.8 {$ivec},[$out],#16 488.Lenter_cbc_enc128: 489 aese $dat,q9 490 aesmc $dat,$dat 491 subs $len,$len,#16 492 aese $dat,$in0 493 aesmc $dat,$dat 494 cclr $step,eq 495 aese $dat,$in1 496 aesmc $dat,$dat 497 aese $dat,q10 498 aesmc $dat,$dat 499 aese $dat,q11 500 aesmc $dat,$dat 501 vld1.8 {q8},[$inp],$step 502 aese $dat,q12 503 aesmc $dat,$dat 504 aese $dat,q13 505 aesmc $dat,$dat 506 aese $dat,q14 507 aesmc $dat,$dat 508 veor q8,q8,$rndzero_n_last 509 aese $dat,q15 510 veor $ivec,$dat,$rndlast 511 b.hs .Loop_cbc_enc128 512 513 vst1.8 {$ivec},[$out],#16 514 b .Lcbc_done 515___ 516{ 517my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 518$code.=<<___; 519.align 5 520.Lcbc_dec: 521 vld1.8 {$dat2},[$inp],#16 522 subs $len,$len,#32 // bias 523 add $cnt,$rounds,#2 524 vorr $in1,$dat,$dat 525 vorr $dat1,$dat,$dat 526 vorr $in2,$dat2,$dat2 527 b.lo .Lcbc_dec_tail 528 529 vorr $dat1,$dat2,$dat2 530 vld1.8 {$dat2},[$inp],#16 531 vorr $in0,$dat,$dat 532 vorr $in1,$dat1,$dat1 533 vorr $in2,$dat2,$dat2 534 535.Loop3x_cbc_dec: 536 aesd $dat0,q8 537 aesimc $dat0,$dat0 538 aesd $dat1,q8 539 aesimc $dat1,$dat1 540 aesd $dat2,q8 541 aesimc $dat2,$dat2 542 vld1.32 {q8},[$key_],#16 543 subs $cnt,$cnt,#2 544 aesd $dat0,q9 545 aesimc $dat0,$dat0 546 aesd $dat1,q9 547 aesimc $dat1,$dat1 548 aesd $dat2,q9 549 aesimc $dat2,$dat2 550 vld1.32 {q9},[$key_],#16 551 b.gt .Loop3x_cbc_dec 552 553 aesd $dat0,q8 554 aesimc $dat0,$dat0 555 aesd $dat1,q8 556 aesimc $dat1,$dat1 557 aesd $dat2,q8 558 aesimc $dat2,$dat2 559 veor $tmp0,$ivec,$rndlast 560 subs $len,$len,#0x30 561 veor $tmp1,$in0,$rndlast 562 mov.lo x6,$len // x6, $cnt, is zero at this point 563 aesd $dat0,q9 564 aesimc $dat0,$dat0 565 aesd $dat1,q9 566 aesimc $dat1,$dat1 567 aesd $dat2,q9 568 aesimc $dat2,$dat2 569 veor $tmp2,$in1,$rndlast 570 add $inp,$inp,x6 // $inp is adjusted in such way that 571 // at exit from the loop $dat1-$dat2 572 // are loaded with last "words" 573 vorr $ivec,$in2,$in2 574 mov $key_,$key 575 aesd $dat0,q12 576 aesimc $dat0,$dat0 577 aesd $dat1,q12 578 aesimc $dat1,$dat1 579 aesd $dat2,q12 580 aesimc $dat2,$dat2 581 vld1.8 {$in0},[$inp],#16 582 aesd $dat0,q13 583 aesimc $dat0,$dat0 584 aesd $dat1,q13 585 aesimc $dat1,$dat1 586 aesd $dat2,q13 587 aesimc $dat2,$dat2 588 vld1.8 {$in1},[$inp],#16 589 aesd $dat0,q14 590 aesimc $dat0,$dat0 591 aesd $dat1,q14 592 aesimc $dat1,$dat1 593 aesd $dat2,q14 594 aesimc $dat2,$dat2 595 vld1.8 {$in2},[$inp],#16 596 aesd $dat0,q15 597 aesd $dat1,q15 598 aesd $dat2,q15 599 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 600 add $cnt,$rounds,#2 601 veor $tmp0,$tmp0,$dat0 602 veor $tmp1,$tmp1,$dat1 603 veor $dat2,$dat2,$tmp2 604 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 605 vst1.8 {$tmp0},[$out],#16 606 vorr $dat0,$in0,$in0 607 vst1.8 {$tmp1},[$out],#16 608 vorr $dat1,$in1,$in1 609 vst1.8 {$dat2},[$out],#16 610 vorr $dat2,$in2,$in2 611 b.hs .Loop3x_cbc_dec 612 613 cmn $len,#0x30 614 b.eq .Lcbc_done 615 nop 616 617.Lcbc_dec_tail: 618 aesd $dat1,q8 619 aesimc $dat1,$dat1 620 aesd $dat2,q8 621 aesimc $dat2,$dat2 622 vld1.32 {q8},[$key_],#16 623 subs $cnt,$cnt,#2 624 aesd $dat1,q9 625 aesimc $dat1,$dat1 626 aesd $dat2,q9 627 aesimc $dat2,$dat2 628 vld1.32 {q9},[$key_],#16 629 b.gt .Lcbc_dec_tail 630 631 aesd $dat1,q8 632 aesimc $dat1,$dat1 633 aesd $dat2,q8 634 aesimc $dat2,$dat2 635 aesd $dat1,q9 636 aesimc $dat1,$dat1 637 aesd $dat2,q9 638 aesimc $dat2,$dat2 639 aesd $dat1,q12 640 aesimc $dat1,$dat1 641 aesd $dat2,q12 642 aesimc $dat2,$dat2 643 cmn $len,#0x20 644 aesd $dat1,q13 645 aesimc $dat1,$dat1 646 aesd $dat2,q13 647 aesimc $dat2,$dat2 648 veor $tmp1,$ivec,$rndlast 649 aesd $dat1,q14 650 aesimc $dat1,$dat1 651 aesd $dat2,q14 652 aesimc $dat2,$dat2 653 veor $tmp2,$in1,$rndlast 654 aesd $dat1,q15 655 aesd $dat2,q15 656 b.eq .Lcbc_dec_one 657 veor $tmp1,$tmp1,$dat1 658 veor $tmp2,$tmp2,$dat2 659 vorr $ivec,$in2,$in2 660 vst1.8 {$tmp1},[$out],#16 661 vst1.8 {$tmp2},[$out],#16 662 b .Lcbc_done 663 664.Lcbc_dec_one: 665 veor $tmp1,$tmp1,$dat2 666 vorr $ivec,$in2,$in2 667 vst1.8 {$tmp1},[$out],#16 668 669.Lcbc_done: 670 vst1.8 {$ivec},[$ivp] 671.Lcbc_abort: 672___ 673} 674$code.=<<___ if ($flavour !~ /64/); 675 vldmia sp!,{d8-d15} 676 ldmia sp!,{r4-r8,pc} 677___ 678$code.=<<___ if ($flavour =~ /64/); 679 ldr x29,[sp],#16 680 ret 681___ 682$code.=<<___; 683.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt 684___ 685}}} 686{{{ 687my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); 688my ($rounds,$cnt,$key_)=("w5","w6","x7"); 689my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); 690my $step="x12"; # aliases with $tctr2 691 692my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); 693my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); 694 695my ($dat,$tmp)=($dat0,$tmp0); 696 697### q8-q15 preloaded key schedule 698 699$code.=<<___; 700.globl ${prefix}_ctr32_encrypt_blocks 701.type ${prefix}_ctr32_encrypt_blocks,%function 702.align 5 703${prefix}_ctr32_encrypt_blocks: 704___ 705$code.=<<___ if ($flavour =~ /64/); 706 stp x29,x30,[sp,#-16]! 707 add x29,sp,#0 708___ 709$code.=<<___ if ($flavour !~ /64/); 710 mov ip,sp 711 stmdb sp!,{r4-r10,lr} 712 vstmdb sp!,{d8-d15} @ ABI specification says so 713 ldr r4, [ip] @ load remaining arg 714___ 715$code.=<<___; 716 ldr $rounds,[$key,#240] 717 718 ldr $ctr, [$ivp, #12] 719 vld1.32 {$dat0},[$ivp] 720 721 vld1.32 {q8-q9},[$key] // load key schedule... 722 sub $rounds,$rounds,#4 723 mov $step,#16 724 cmp $len,#2 725 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys 726 sub $rounds,$rounds,#2 727 vld1.32 {q12-q13},[$key_],#32 728 vld1.32 {q14-q15},[$key_],#32 729 vld1.32 {$rndlast},[$key_] 730 add $key_,$key,#32 731 mov $cnt,$rounds 732 cclr $step,lo 733#ifndef __ARMEB__ 734 rev $ctr, $ctr 735#endif 736 vorr $dat1,$dat0,$dat0 737 add $tctr1, $ctr, #1 738 vorr $dat2,$dat0,$dat0 739 add $ctr, $ctr, #2 740 vorr $ivec,$dat0,$dat0 741 rev $tctr1, $tctr1 742 vmov.32 ${dat1}[3],$tctr1 743 b.ls .Lctr32_tail 744 rev $tctr2, $ctr 745 sub $len,$len,#3 // bias 746 vmov.32 ${dat2}[3],$tctr2 747 b .Loop3x_ctr32 748 749.align 4 750.Loop3x_ctr32: 751 aese $dat0,q8 752 aesmc $dat0,$dat0 753 aese $dat1,q8 754 aesmc $dat1,$dat1 755 aese $dat2,q8 756 aesmc $dat2,$dat2 757 vld1.32 {q8},[$key_],#16 758 subs $cnt,$cnt,#2 759 aese $dat0,q9 760 aesmc $dat0,$dat0 761 aese $dat1,q9 762 aesmc $dat1,$dat1 763 aese $dat2,q9 764 aesmc $dat2,$dat2 765 vld1.32 {q9},[$key_],#16 766 b.gt .Loop3x_ctr32 767 768 aese $dat0,q8 769 aesmc $tmp0,$dat0 770 aese $dat1,q8 771 aesmc $tmp1,$dat1 772 vld1.8 {$in0},[$inp],#16 773 vorr $dat0,$ivec,$ivec 774 aese $dat2,q8 775 aesmc $dat2,$dat2 776 vld1.8 {$in1},[$inp],#16 777 vorr $dat1,$ivec,$ivec 778 aese $tmp0,q9 779 aesmc $tmp0,$tmp0 780 aese $tmp1,q9 781 aesmc $tmp1,$tmp1 782 vld1.8 {$in2},[$inp],#16 783 mov $key_,$key 784 aese $dat2,q9 785 aesmc $tmp2,$dat2 786 vorr $dat2,$ivec,$ivec 787 add $tctr0,$ctr,#1 788 aese $tmp0,q12 789 aesmc $tmp0,$tmp0 790 aese $tmp1,q12 791 aesmc $tmp1,$tmp1 792 veor $in0,$in0,$rndlast 793 add $tctr1,$ctr,#2 794 aese $tmp2,q12 795 aesmc $tmp2,$tmp2 796 veor $in1,$in1,$rndlast 797 add $ctr,$ctr,#3 798 aese $tmp0,q13 799 aesmc $tmp0,$tmp0 800 aese $tmp1,q13 801 aesmc $tmp1,$tmp1 802 veor $in2,$in2,$rndlast 803 rev $tctr0,$tctr0 804 aese $tmp2,q13 805 aesmc $tmp2,$tmp2 806 vmov.32 ${dat0}[3], $tctr0 807 rev $tctr1,$tctr1 808 aese $tmp0,q14 809 aesmc $tmp0,$tmp0 810 aese $tmp1,q14 811 aesmc $tmp1,$tmp1 812 vmov.32 ${dat1}[3], $tctr1 813 rev $tctr2,$ctr 814 aese $tmp2,q14 815 aesmc $tmp2,$tmp2 816 vmov.32 ${dat2}[3], $tctr2 817 subs $len,$len,#3 818 aese $tmp0,q15 819 aese $tmp1,q15 820 aese $tmp2,q15 821 822 veor $in0,$in0,$tmp0 823 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] 824 vst1.8 {$in0},[$out],#16 825 veor $in1,$in1,$tmp1 826 mov $cnt,$rounds 827 vst1.8 {$in1},[$out],#16 828 veor $in2,$in2,$tmp2 829 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] 830 vst1.8 {$in2},[$out],#16 831 b.hs .Loop3x_ctr32 832 833 adds $len,$len,#3 834 b.eq .Lctr32_done 835 cmp $len,#1 836 mov $step,#16 837 cclr $step,eq 838 839.Lctr32_tail: 840 aese $dat0,q8 841 aesmc $dat0,$dat0 842 aese $dat1,q8 843 aesmc $dat1,$dat1 844 vld1.32 {q8},[$key_],#16 845 subs $cnt,$cnt,#2 846 aese $dat0,q9 847 aesmc $dat0,$dat0 848 aese $dat1,q9 849 aesmc $dat1,$dat1 850 vld1.32 {q9},[$key_],#16 851 b.gt .Lctr32_tail 852 853 aese $dat0,q8 854 aesmc $dat0,$dat0 855 aese $dat1,q8 856 aesmc $dat1,$dat1 857 aese $dat0,q9 858 aesmc $dat0,$dat0 859 aese $dat1,q9 860 aesmc $dat1,$dat1 861 vld1.8 {$in0},[$inp],$step 862 aese $dat0,q12 863 aesmc $dat0,$dat0 864 aese $dat1,q12 865 aesmc $dat1,$dat1 866 vld1.8 {$in1},[$inp] 867 aese $dat0,q13 868 aesmc $dat0,$dat0 869 aese $dat1,q13 870 aesmc $dat1,$dat1 871 veor $in0,$in0,$rndlast 872 aese $dat0,q14 873 aesmc $dat0,$dat0 874 aese $dat1,q14 875 aesmc $dat1,$dat1 876 veor $in1,$in1,$rndlast 877 aese $dat0,q15 878 aese $dat1,q15 879 880 cmp $len,#1 881 veor $in0,$in0,$dat0 882 veor $in1,$in1,$dat1 883 vst1.8 {$in0},[$out],#16 884 b.eq .Lctr32_done 885 vst1.8 {$in1},[$out] 886 887.Lctr32_done: 888___ 889$code.=<<___ if ($flavour !~ /64/); 890 vldmia sp!,{d8-d15} 891 ldmia sp!,{r4-r10,pc} 892___ 893$code.=<<___ if ($flavour =~ /64/); 894 ldr x29,[sp],#16 895 ret 896___ 897$code.=<<___; 898.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks 899___ 900}}} 901$code.=<<___; 902#endif 903___ 904######################################## 905if ($flavour =~ /64/) { ######## 64-bit code 906 my %opcode = ( 907 "aesd" => 0x4e285800, "aese" => 0x4e284800, 908 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); 909 910 local *unaes = sub { 911 my ($mnemonic,$arg)=@_; 912 913 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && 914 sprintf ".inst\t0x%08x\t//%s %s", 915 $opcode{$mnemonic}|$1|($2<<5), 916 $mnemonic,$arg; 917 }; 918 919 foreach(split("\n",$code)) { 920 s/\`([^\`]*)\`/eval($1)/geo; 921 922 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers 923 s/@\s/\/\//o; # old->new style commentary 924 925 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 926 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or 927 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or 928 s/vmov\.i8/movi/o or # fix up legacy mnemonics 929 s/vext\.8/ext/o or 930 s/vrev32\.8/rev32/o or 931 s/vtst\.8/cmtst/o or 932 s/vshr/ushr/o or 933 s/^(\s+)v/$1/o or # strip off v prefix 934 s/\bbx\s+lr\b/ret/o; 935 936 # fix up remainig legacy suffixes 937 s/\.[ui]?8//o; 938 m/\],#8/o and s/\.16b/\.8b/go; 939 s/\.[ui]?32//o and s/\.16b/\.4s/go; 940 s/\.[ui]?64//o and s/\.16b/\.2d/go; 941 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; 942 943 print $_,"\n"; 944 } 945} else { ######## 32-bit code 946 my %opcode = ( 947 "aesd" => 0xf3b00340, "aese" => 0xf3b00300, 948 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); 949 950 local *unaes = sub { 951 my ($mnemonic,$arg)=@_; 952 953 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { 954 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) 955 |(($2&7)<<1) |(($2&8)<<2); 956 # since ARMv7 instructions are always encoded little-endian. 957 # correct solution is to use .inst directive, but older 958 # assemblers don't implement it:-( 959 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", 960 $word&0xff,($word>>8)&0xff, 961 ($word>>16)&0xff,($word>>24)&0xff, 962 $mnemonic,$arg; 963 } 964 }; 965 966 sub unvtbl { 967 my $arg=shift; 968 969 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && 970 sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 971 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 972 } 973 974 sub unvdup32 { 975 my $arg=shift; 976 977 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && 978 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; 979 } 980 981 sub unvmov32 { 982 my $arg=shift; 983 984 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && 985 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; 986 } 987 988 foreach(split("\n",$code)) { 989 s/\`([^\`]*)\`/eval($1)/geo; 990 991 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers 992 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers 993 s/\/\/\s?/@ /o; # new->old style commentary 994 995 # fix up remainig new-style suffixes 996 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or 997 s/\],#[0-9]+/]!/o; 998 999 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or 1000 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or 1001 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or 1002 s/vdup\.32\s+(.*)/unvdup32($1)/geo or 1003 s/vmov\.32\s+(.*)/unvmov32($1)/geo or 1004 s/^(\s+)b\./$1b/o or 1005 s/^(\s+)mov\./$1mov/o or 1006 s/^(\s+)ret/$1bx\tlr/o; 1007 1008 print $_,"\n"; 1009 } 1010} 1011 1012close STDOUT; 1013