#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for the ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases, and it supports both 32- and 64-bit modes of
# operation. The latter is achieved by limiting the number of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on the mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated for by a dedicated code path for the 128-bit CBC encrypt
# case. On Cortex-A57, performance of the parallelizable modes seems to
# be limited by the sheer number of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_hw";

$code=<<___;
#include <ring-core/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit. The goal
# is to maintain both 32- and 64-bit code within a single module and
# transliterate common code to either flavour with regex voodoo.
#
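# For example (an illustrative case, not quoted from the code below):
# a shared line such as
#	vld1.32	{q0},[x0],#16
# ends up as "vld1.32 {q0},[r0]!" in the 32-bit flavour and as
# "ld1 {v0.4s},[x0],#16" in the 64-bit flavour once the post-processing
# loops at the end of this file have run.
#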
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# On AArch64, put the data in .rodata and use adrp + add for compatibility with
# execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
$code.=<<___	if ($flavour =~ /64/);
	adrp	$ptr,:pg_hi21:.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___	if ($flavour !~ /64/);
	adr	$ptr,.Lrcon
___
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	// 192-bit key support was removed.
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

// 192-bit key support was removed.

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
# Decryption removed in *ring*.
# &gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo

	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
	// affected by silicon errata #1742098 [0] and #1655431 [1],
	// respectively, where the second instruction of an aese/aesmc
	// instruction pair may execute twice if an interrupt is taken right
	// after the first instruction consumes an input register of which a
	// single 32-bit lane has been updated the last time it was modified.
	//
	// This function uses a counter in one 32-bit lane. The vmov.32 lines
	// could write to $dat1 and $dat2 directly, but that trips these bugs.
	// We write to $ivec and copy to the final register as a workaround.
	//
	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	add		$tctr1, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${ivec}[3],$tctr1
	add		$ctr, $ctr, #2
	vorr		$dat1,$ivec,$ivec
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	vmov.32		${ivec}[3],$tctr2
	sub		$len,$len,#3		// bias
	vorr		$dat2,$ivec,$ivec
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 add		$tctr0,$ctr,#1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 rev		$tctr0,$tctr0
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 // Note the logic to update $dat0, $dat1, and $dat2 is written to work
	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
	 // 32-bit mode. See the comment above.
	 veor		$in2,$in2,$rndlast
	 vmov.32	${ivec}[3], $tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vorr		$dat0,$ivec,$ivec
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	 vmov.32	${ivec}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vorr		$dat1,$ivec,$ivec
	 vmov.32	${ivec}[3], $tctr2
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vorr		$dat2,$ivec,$ivec
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;
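	# e.g. (illustrative) "cclr	x12,lo" becomes "csel	x12,xzr,x12,lo"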

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
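	# e.g. (illustrative) "vmov.32	q6[3],w8" has by now become "mov	v6.s[3],w8"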

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so emit
	    # the instruction word as raw bytes. The correct solution is to
	    # use the .inst directive, but older assemblers don't implement it :-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
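    # e.g. (illustrative) unaes("aese","q0,q1") yields
    # ".byte	0x02,0x03,0xb0,0xf3	@ aese q0,q1"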

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
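    # e.g. (illustrative) unvtbl("q3,{q0},q2") expands to
    # "vtbl.8	d6,{q0},d4" followed by "vtbl.8	d7,{q0},d5"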

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
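    # e.g. (illustrative) unvdup32("q3,q0[3]") yields "vdup.32	q3,d1[1]"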

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
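    # e.g. (illustrative) unvmov32("q6[3],r10") yields "vmov.32	d13[1],r10"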

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT";
