#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.4	1xNEON		3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8		14.1
# Cortex-A8		10.5(*)/+160%	13.9		6.35
# Cortex-A9		12.9(**)/+110%	14.3		6.50
# Cortex-A15		11.0/+40%	16.0		5.00
# Snapdragon S4		11.5/+125%	13.6		4.90
#
# (*)	most "favourable" result for aligned data on a little-endian
#	processor; the result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then the Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_) = (@t[0..1]);
my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider the order in which the variables are addressed by
	# their index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a' and 'b' are permanently allocated in registers, @x[0..7],
	# while the 'c's and a pair of 'd's are maintained in memory. If
	# you observe the 'c' column, you'll notice that the pair of 'c's
	# is invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see a
	# bunch of 'c' stores and loads in the middle, but none at the
	# beginning or end. If you observe the 'd' column, you'll notice
	# that 15 and 13 are reused in the next pair of rounds. This is
	# why these two are chosen for offloading to memory, to make the
	# loads count more.
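	# For reference, this is one ChaCha quarter-round at the
	# specification level (a sketch of what is being scheduled
	# below, not emitted code):
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<=  8;
	#	c += d; b ^= c; b <<<=  7;
	#
	# ARM has no rotate-left, so a left-rotate by n appears below
	# as 'ror#(32-n)': the 16, 20, 24 and 25 in the code correspond
	# to the 16, 12, 8 and 7 above. Quarter-rounds are interleaved
	# in pairs to hide result latency.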
	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#16')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b0],@x[$b0],'ror#25')" );
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" )	if ($odd);
	push @ret,(
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	"&str	($xd_,'[sp,#4*(16+$d1)]')",
	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" )	if (!$odd);
	push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2]				if (!$odd);
	$xd_=@x[$d3]				if ($odd);
	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#16')",
	"&str	($xc_,'[sp,#4*(16+$c1)]')",
	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	"&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	"&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	"&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	"&mov	(@x[$b2],@x[$b2],'ror#25')",
	"&add	($xc_,$xc_,$xd_)",
	"&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}

$code.=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch	armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
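@ Note: in Thumb-2 a conditional instruction must be preceded by an
@ explicit IT block, while in ARM mode the 'eq' predicates below are
@ encoded directly; the same #ifdef pattern repeats throughout.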
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
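	@ the word-sized loads and stores in the fast path below assume
	@ 4-byte alignment, which pre-ARMv7 cores do not generally
	@ tolerate; if either pointer is off, take the byte-wise
	@ .Lunaligned path instead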
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
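	@ ChaCha20 is defined on little-endian words, so on big-endian
	@ processors the accumulated words are byte-swapped before being
	@ xor-ed with the byte-ordered input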
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]	@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]	@ zero or ...
	ldrhsb	@t[0],[r12],#16		@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]	@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]	@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	strb	@x[$j+0],[r14],#16	@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]	@ load more input
	ldrhsb	@t[1],[r12,#-10]
	strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]	@ load more input
	ldrhsb	@t[1],[r12,#-9]
	strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	strb	@x[$j+1],[r14,#-10]
	strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
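	@ note: with fewer than 64 bytes left ('lo') the conditional byte
	@ loads above were skipped and the temporaries stay zero, so the
	@ raw key stream lands in the stack buffer for .Ltail to consume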
	strb	@x[$j+2],[r14,#-5]
	strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{$b0-$c0},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{$d0},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{$a0},[r14]!		@ load sigma
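	@ the post-increment above leaves r14 pointing at .Lone, the
	@ (1,0,0,0) vector used below to step the per-block counters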
	vld1.32	{$t0},[r14]		@ one
	vst1.32	{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{$a0-$b0},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr	$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr	$t1#lo,[sp,#4*(16+2)]
	vmov	$a1,$a0
	vstr	$t2#lo,[sp,#4*(16+4)]
	vmov	$a2,$a0
	vmov	$b1,$b0
	vmov	$b2,$b0
	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	@t[3],#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	vmov	$a1,$a0
	str	@t[3],[sp,#4*(32+2)]	@ save len
	vmov	$a2,$a0
	str	r12, [sp,#4*(32+1)]	@ save inp
	vmov	$b1,$b0
	str	r14, [sp,#4*(32+0)]	@ save out
	vmov	$b2,$b0
.Loop_neon_enter:
	ldr	@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0	@ counter+1
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov	$c1,$c0
	ldr	@t[2], [sp,#4*(13)]
	vmov	$c2,$c0
	ldr	@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0	@ counter+2
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	add	@x[12],@x[12],#3	@ counter+3
	b	.Loop_neon

.align	4
.Loop_neon:
	subs	@t[3],@t[3],#1
___
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne	.Loop_neon

	add	@t[3],sp,#32
	vld1.32	{$t0-$t1},[sp]		@ load key material
	vld1.32	{$t2-$t3},[@t[3]]

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr	r12,[sp,#4*(32+1)]	@ load inp
	ldr	r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0	@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr	$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp	@t[3],#64*4
	blo	.Ltail_neon

	vld1.8	{$t0-$t1},[r12]!	@ load input
	mov	@t[3],sp
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0		@ xor with input
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	vst1.8	{$a0-$b0},[r14]!	@ store output
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

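	@ third block: xor with input while reloading sigma|key|counter
	@ for the next iteration and stepping the counter lane by four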
	veor	$a2,$a2,$t0
	vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor	$t0#hi,$t0#hi,$t0#hi
	vldr	$t0#lo,[sp,#4*(16+4)]	@ four
	veor	$b2,$b2,$t1
	vld1.32	{$c0-$d0},[@t[3]]
	veor	$c2,$c2,$t2
	vst1.8	{$a1-$b1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	vst1.8	{$a2-$b2},[r14]!
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
	vst1.8	{$c2-$d2},[r14]!
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]	@ xor with input
	add	@t[0],sp,#4*(4)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[5],@x[5],@t[1]
	ldr	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	ldr	@t[2],[r12,#-8]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
	add	@t[0],sp,#4*(8)
	eor	@x[5],@x[5],@t[1]
	str	@x[4],[r14],#16		@ store output
	eor	@x[6],@x[6],@t[2]
	str	@x[5],[r14,#-12]
	eor	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	ldr	@t[0],[r12],#16		@ load input
	add	@x[1],@x[1],@t[1]
	ldr	@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add	@x[2],@x[2],@t[2]
	ldr	@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[3],@x[3],@t[3]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]
	add	@t[0],sp,#4*(12)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],@t[0],#4		@ next counter value
	add	@x[5],@x[5],@t[1]
	str	@t[0],[sp,#4*(12)]	@ save next counter value
	ldr	@t[0],[r12],#16		@ load input
	add	@x[6],@x[6],@t[2]
	add	@x[4],@x[4],#3		@ counter+3
	ldr	@t[1],[r12,#-12]
	add	@x[7],@x[7],@t[3]
	ldr	@t[2],[r12,#-8]
	ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	ldrhi	@t[0],[sp,#4*(32+2)]	@ re-load len
	eor	@x[5],@x[5],@t[1]
	eor	@x[6],@x[6],@t[2]
	str	@x[4],[r14],#16		@ store output
	eor	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	sub	@t[3],@t[0],#64*4	@ len-=64*4
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_neon_outer

	b	.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str	@t[3], [sp,#4*(20+32+2)]	@ save len
	add	@t[3],sp,#4*(32+4)
	str	r12, [sp,#4*(20+32+1)]	@ save inp
	str	r14, [sp,#4*(20+32+0)]	@ save out

	ldr	@x[12],[sp,#4*(16+10)]
	ldr	@x[14],[sp,#4*(16+11)]
	vldmia	@t[3],{d8-d15}		@ fulfill ABI requirement
	str	@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str	@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(20+16+15)]
	add	@t[3],sp,#4*(20)
	vst1.32	{$a0-$b0},[@t[3]]!	@ copy key
	add	sp,sp,#4*(20)		@ switch frame
	vst1.32	{$c0-$d0},[@t[3]]
	mov	@t[3],#10
	b	.Loop			@ go integer-only

.align	4
.Ltail_neon:
	cmp	@t[3],#64*3
	bhs	.L192_or_more_neon
	cmp	@t[3],#64*2
	bhs	.L128_or_more_neon
	cmp	@t[3],#64*1
	bhs	.L64_or_more_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a0-$b0},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c0-$d0},[@t[0]]
	b	.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vst1.8	{$a0-$b0},[r14]!
	vst1.8	{$c0-$d0},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a1-$b1},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c1-$d1},[@t[0]]
	sub	@t[3],@t[3],#64*1	@ len-=64*1
	b	.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vst1.8	{$a0-$b0},[r14]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vst1.8	{$a1-$b1},[r14]!
	vst1.8	{$c1-$d1},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a2-$b2},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c2-$d2},[@t[0]]
	sub	@t[3],@t[3],#64*2	@ len-=64*2
	b	.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$a0-$b0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vst1.8	{$c0-$d0},[r14]!
	veor	$b2,$b2,$t1
	vst1.8	{$a1-$b1},[r14]!
	veor	$c2,$c2,$t2
	vst1.8	{$c1-$d1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$a2-$b2},[r14]!
	vst1.8	{$c2-$d2},[r14]!

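	@ three blocks (192 bytes) of this batch are out; if that was all
	@ of it we are done, otherwise the fourth block is finished in the
	@ integer registers and drained through the byte-wise tail loop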
	beq	.Ldone_neon

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(4)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	sp,{@x[0]-@x[7]}
	add	@x[0],sp,#4*(16+8)

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(12)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[4],@x[4],#3		@ counter+3
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	ldr	@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	@t[0],{@x[0]-@x[7]}
	add	@t[2],sp,#4*(0)
	sub	@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb	@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb	@t[1],[r12],#1		@ read input
	subs	@t[3],@t[3],#1
	eor	@t[0],@t[0],@t[1]
	strb	@t[0],[r14],#1		@ store output
	bne	.Loop_tail_neon

.Ldone_neon:
	add	sp,sp,#4*(32+4)
	vldmia	sp,{d8-d15}
	add	sp,sp,#4*(16+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_neon,.-ChaCha20_neon
.comm	OPENSSL_armcap_P,4,4
#endif
___
}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT";