• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# March 2016
18#
19# Initial support for Fujitsu SPARC64 X/X+ comprises minimally
20# required key setup and single-block procedures.
21#
22# April 2016
23#
24# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
25# that the parallelizable nature of CBC decrypt and CTR is not utilized
26# yet. CBC encrypt, on the other hand, is as good as it can possibly
27# get, processing one byte in 4.1 cycles with a 128-bit key on SPARC64 X.
28# This is ~6x faster than a pure software implementation...
29#
30# July 2016
31#
32# Switch from faligndata to fshiftorx, which makes it possible to omit
33# alignaddr instructions and improves single-block and short-input
34# performance with misaligned data.
35
# The last command-line argument, if present, names the output file.
# With no (or a false) argument the generated assembly is written to the
# inherited STDOUT.  Three-argument open keeps characters in the file
# name from being interpreted as mode syntax, and a failed open is fatal
# instead of silently writing to the wrong place.
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
37
# Single-block routines: appends the assembly template for aes_fx_encrypt
# and aes_fx_decrypt to $code.  The lexicals declared below are simply the
# names of SPARC registers %o0..%o5 and are interpolated into the
# template text.  The template itself is string data, so it is left
# untouched here.
38{
39my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
40
41$code.=<<___;
42#ifndef __ASSEMBLER__
43# define __ASSEMBLER__ 1
44#endif
45#include "crypto/sparc_arch.h"
46
47#define LOCALS (STACK_BIAS+STACK_FRAME)
48
49.text
50
51.globl	aes_fx_encrypt
52.align	32
53aes_fx_encrypt:
54	and		$inp, 7, $tmp		! is input aligned?
55	andn		$inp, 7, $inp
56	ldd		[$key +  0], %f6	! round[0]
57	ldd		[$key +  8], %f8
58	mov		%o7, %g1
59	ld		[$key + 240], $rounds
60
611:	call		.+8
62	add		%o7, .Linp_align-1b, %o7
63
64	sll		$tmp, 3, $tmp
65	ldd		[$inp + 0], %f0		! load input
66	brz,pt		$tmp, .Lenc_inp_aligned
67	ldd		[$inp + 8], %f2
68
69	ldd		[%o7 + $tmp], %f14	! shift left params
70	ldd		[$inp + 16], %f4
71	fshiftorx	%f0, %f2, %f14, %f0
72	fshiftorx	%f2, %f4, %f14, %f2
73
74.Lenc_inp_aligned:
75	ldd		[$key + 16], %f10	! round[1]
76	ldd		[$key + 24], %f12
77
78	fxor		%f0, %f6, %f0		! ^=round[0]
79	fxor		%f2, %f8, %f2
80	ldd		[$key + 32], %f6	! round[2]
81	ldd		[$key + 40], %f8
82	add		$key, 32, $key
83	sub		$rounds, 4, $rounds
84
85.Loop_enc:
86	fmovd		%f0, %f4
87	faesencx	%f2, %f10, %f0
88	faesencx	%f4, %f12, %f2
89	ldd		[$key + 16], %f10
90	ldd		[$key + 24], %f12
91	add		$key, 32, $key
92
93	fmovd		%f0, %f4
94	faesencx	%f2, %f6, %f0
95	faesencx	%f4, %f8, %f2
96	ldd		[$key +  0], %f6
97	ldd		[$key +  8], %f8
98
99	brnz,a		$rounds, .Loop_enc
100	sub		$rounds, 2, $rounds
101
102	andcc		$out, 7, $tmp		! is output aligned?
103	andn		$out, 7, $out
104	mov		0xff, $mask
105	srl		$mask, $tmp, $mask
106	add		%o7, 64, %o7
107	sll		$tmp, 3, $tmp
108
109	fmovd		%f0, %f4
110	faesencx	%f2, %f10, %f0
111	faesencx	%f4, %f12, %f2
112	ldd		[%o7 + $tmp], %f14	! shift right params
113
114	fmovd		%f0, %f4
115	faesenclx	%f2, %f6, %f0
116	faesenclx	%f4, %f8, %f2
117
118	bnz,pn		%icc, .Lenc_out_unaligned
119	mov		%g1, %o7
120
121	std		%f0, [$out + 0]
122	retl
123	std		%f2, [$out + 8]
124
125.align	16
126.Lenc_out_unaligned:
127	add		$out, 16, $inp
128	orn		%g0, $mask, $tmp
129	fshiftorx	%f0, %f0, %f14, %f4
130	fshiftorx	%f0, %f2, %f14, %f6
131	fshiftorx	%f2, %f2, %f14, %f8
132
133	stda		%f4, [$out + $mask]0xc0	! partial store
134	std		%f6, [$out + 8]
135	stda		%f8, [$inp + $tmp]0xc0	! partial store
136	retl
137	nop
138.type	aes_fx_encrypt,#function
139.size	aes_fx_encrypt,.-aes_fx_encrypt
140
141.globl	aes_fx_decrypt
142.align	32
143aes_fx_decrypt:
144	and		$inp, 7, $tmp		! is input aligned?
145	andn		$inp, 7, $inp
146	ldd		[$key +  0], %f6	! round[0]
147	ldd		[$key +  8], %f8
148	mov		%o7, %g1
149	ld		[$key + 240], $rounds
150
1511:	call		.+8
152	add		%o7, .Linp_align-1b, %o7
153
154	sll		$tmp, 3, $tmp
155	ldd		[$inp + 0], %f0		! load input
156	brz,pt		$tmp, .Ldec_inp_aligned
157	ldd		[$inp + 8], %f2
158
159	ldd		[%o7 + $tmp], %f14	! shift left params
160	ldd		[$inp + 16], %f4
161	fshiftorx	%f0, %f2, %f14, %f0
162	fshiftorx	%f2, %f4, %f14, %f2
163
164.Ldec_inp_aligned:
165	ldd		[$key + 16], %f10	! round[1]
166	ldd		[$key + 24], %f12
167
168	fxor		%f0, %f6, %f0		! ^=round[0]
169	fxor		%f2, %f8, %f2
170	ldd		[$key + 32], %f6	! round[2]
171	ldd		[$key + 40], %f8
172	add		$key, 32, $key
173	sub		$rounds, 4, $rounds
174
175.Loop_dec:
176	fmovd		%f0, %f4
177	faesdecx	%f2, %f10, %f0
178	faesdecx	%f4, %f12, %f2
179	ldd		[$key + 16], %f10
180	ldd		[$key + 24], %f12
181	add		$key, 32, $key
182
183	fmovd		%f0, %f4
184	faesdecx	%f2, %f6, %f0
185	faesdecx	%f4, %f8, %f2
186	ldd		[$key +  0], %f6
187	ldd		[$key +  8], %f8
188
189	brnz,a		$rounds, .Loop_dec
190	sub		$rounds, 2, $rounds
191
192	andcc		$out, 7, $tmp		! is output aligned?
193	andn		$out, 7, $out
194	mov		0xff, $mask
195	srl		$mask, $tmp, $mask
196	add		%o7, 64, %o7
197	sll		$tmp, 3, $tmp
198
199	fmovd		%f0, %f4
200	faesdecx	%f2, %f10, %f0
201	faesdecx	%f4, %f12, %f2
202	ldd		[%o7 + $tmp], %f14	! shift right params
203
204	fmovd		%f0, %f4
205	faesdeclx	%f2, %f6, %f0
206	faesdeclx	%f4, %f8, %f2
207
208	bnz,pn		%icc, .Ldec_out_unaligned
209	mov		%g1, %o7
210
211	std		%f0, [$out + 0]
212	retl
213	std		%f2, [$out + 8]
214
215.align	16
216.Ldec_out_unaligned:
217	add		$out, 16, $inp
218	orn		%g0, $mask, $tmp
219	fshiftorx	%f0, %f0, %f14, %f4
220	fshiftorx	%f0, %f2, %f14, %f6
221	fshiftorx	%f2, %f2, %f14, %f8
222
223	stda		%f4, [$out + $mask]0xc0	! partial store
224	std		%f6, [$out + 8]
225	stda		%f8, [$inp + $tmp]0xc0	! partial store
226	retl
227	nop
228.type	aes_fx_decrypt,#function
229.size	aes_fx_decrypt,.-aes_fx_decrypt
230___
231}
# Key setup: appends the assembly template for aes_fx_set_decrypt_key and
# aes_fx_set_encrypt_key to $code.  Five register names are bound from the
# six generated (%o0..%o5); the last one, %o5, is left unnamed.
232{
233my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
234$code.=<<___;
235.globl	aes_fx_set_decrypt_key
236.align	32
237aes_fx_set_decrypt_key:
238	b		.Lset_encrypt_key
239	mov		-1, $inc
240	retl
241	nop
242.type	aes_fx_set_decrypt_key,#function
243.size	aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
244
245.globl	aes_fx_set_encrypt_key
246.align	32
247aes_fx_set_encrypt_key:
248	mov		1, $inc
249	nop
250.Lset_encrypt_key:
251	and		$inp, 7, $tmp
252	andn		$inp, 7, $inp
253	sll		$tmp, 3, $tmp
254	mov		%o7, %g1
255
2561:	call		.+8
257	add		%o7, .Linp_align-1b, %o7
258
259	ldd		[%o7 + $tmp], %f10	! shift left params
260	mov		%g1, %o7
261
262	cmp		$bits, 192
263	ldd		[$inp + 0], %f0
264	bl,pt		%icc, .L128
265	ldd		[$inp + 8], %f2
266
267	be,pt		%icc, .L192
268	ldd		[$inp + 16], %f4
269	brz,pt		$tmp, .L256aligned
270	ldd		[$inp + 24], %f6
271
272	ldd		[$inp + 32], %f8
273	fshiftorx	%f0, %f2, %f10, %f0
274	fshiftorx	%f2, %f4, %f10, %f2
275	fshiftorx	%f4, %f6, %f10, %f4
276	fshiftorx	%f6, %f8, %f10, %f6
277
278.L256aligned:
279	mov		14, $bits
280	and		$inc, `14*16`, $tmp
281	st		$bits, [$out + 240]	! store rounds
282	add		$out, $tmp, $out	! start or end of key schedule
283	sllx		$inc, 4, $inc		! 16 or -16
284___
# Unrolled body following .L256aligned: six copies of the same step, with
# $i (0..5) interpolated into the backticked faeskeyx immediate.  $i is a
# package variable, so its final value (6) is still visible to the tail
# template that follows the loop.
285for ($i=0; $i<6; $i++) {
286    $code.=<<___;
287	std		%f0, [$out + 0]
288	faeskeyx	%f6, `0x10+$i`, %f0
289	std		%f2, [$out + 8]
290	add		$out, $inc, $out
291	faeskeyx	%f0, 0x00, %f2
292	std		%f4, [$out + 0]
293	faeskeyx	%f2, 0x01, %f4
294	std		%f6, [$out + 8]
295	add		$out, $inc, $out
296	faeskeyx	%f4, 0x00, %f6
297___
298}
# Tail of the .L256aligned path (uses $i == 6 left over from the loop),
# followed by the entry to the .L192 path.
299$code.=<<___;
300	std		%f0, [$out + 0]
301	faeskeyx	%f6, `0x10+$i`, %f0
302	std		%f2, [$out + 8]
303	add		$out, $inc, $out
304	faeskeyx	%f0, 0x00, %f2
305	std		%f4,[$out + 0]
306	std		%f6,[$out + 8]
307	add		$out, $inc, $out
308	std		%f0,[$out + 0]
309	std		%f2,[$out + 8]
310	retl
311	xor		%o0, %o0, %o0		! return 0
312
313.align	16
314.L192:
315	brz,pt		$tmp, .L192aligned
316	nop
317
318	ldd		[$inp + 24], %f6
319	fshiftorx	%f0, %f2, %f10, %f0
320	fshiftorx	%f2, %f4, %f10, %f2
321	fshiftorx	%f4, %f6, %f10, %f4
322
323.L192aligned:
324	mov		12, $bits
325	and		$inc, `12*16`, $tmp
326	st		$bits, [$out + 240]	! store rounds
327	add		$out, $tmp, $out	! start or end of key schedule
328	sllx		$inc, 4, $inc		! 16 or -16
329___
# Unrolled body following .L192aligned: four copies ($i = 0, 2, 4, 6), each
# using immediates 0x10+$i and 0x10+$i+1.  The trailing faeskeyx is
# appended for every copy except the last one ($i < 6).
330for ($i=0; $i<8; $i+=2) {
331    $code.=<<___;
332	std		%f0, [$out + 0]
333	faeskeyx	%f4, `0x10+$i`, %f0
334	std		%f2, [$out + 8]
335	add		$out, $inc, $out
336	faeskeyx	%f0, 0x00, %f2
337	std		%f4, [$out + 0]
338	faeskeyx	%f2, 0x00, %f4
339	std		%f0, [$out + 8]
340	add		$out, $inc, $out
341	faeskeyx	%f4, `0x10+$i+1`, %f0
342	std		%f2, [$out + 0]
343	faeskeyx	%f0, 0x00, %f2
344	std		%f4, [$out + 8]
345	add		$out, $inc, $out
346___
347$code.=<<___		if ($i<6);
348	faeskeyx	%f2, 0x00, %f4
349___
350}
# Tail of the .L192aligned path, followed by the entry to the .L128 path.
351$code.=<<___;
352	std		%f0, [$out + 0]
353	std		%f2, [$out + 8]
354	retl
355	xor		%o0, %o0, %o0		! return 0
356
357.align	16
358.L128:
359	brz,pt		$tmp, .L128aligned
360	nop
361
362	ldd		[$inp + 16], %f4
363	fshiftorx	%f0, %f2, %f10, %f0
364	fshiftorx	%f2, %f4, %f10, %f2
365
366.L128aligned:
367	mov		10, $bits
368	and		$inc, `10*16`, $tmp
369	st		$bits, [$out + 240]	! store rounds
370	add		$out, $tmp, $out	! start or end of key schedule
371	sllx		$inc, 4, $inc		! 16 or -16
372___
# Unrolled body following .L128aligned: ten copies with $i = 0..9 in the
# faeskeyx immediate.
373for ($i=0; $i<10; $i++) {
374    $code.=<<___;
375	std		%f0, [$out + 0]
376	faeskeyx	%f2, `0x10+$i`, %f0
377	std		%f2, [$out + 8]
378	add		$out, $inc, $out
379	faeskeyx	%f0, 0x00, %f2
380___
381}
# Tail of the .L128aligned path and the symbol type/size directives.
382$code.=<<___;
383	std		%f0, [$out + 0]
384	std		%f2, [$out + 8]
385	retl
386	xor		%o0, %o0, %o0		! return 0
387.type	aes_fx_set_encrypt_key,#function
388.size	aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
389___
390}
# CBC mode: appends the assembly template for aes_fx_cbc_encrypt to $code.
# The single entry point serves both directions; the template branches to
# .Lcbc_decrypt when $dir is zero.
#
# Register naming used by the template:
#   %i0..%i5  - named as $inp, $out, $len, $key, $ivp, $dir
#   %l0..%l6  - scratch names ($rounds .. $mask); %l7 is generated but
#               not bound to a name
#   even-numbered %f16, %f18, ... - named FP values ($iv0 .. $fshift)
#   $ileft/$iright reuse the registers named $ialign/$oalign
391{
392my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
393my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
394my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
395   = map("%f$_",grep { !($_ & 1) } (16 .. 62));
396my ($ileft,$iright) = ($ialign,$oalign);
397
398$code.=<<___;
399.globl	aes_fx_cbc_encrypt
400.align	32
401aes_fx_cbc_encrypt:
402	save		%sp, -STACK_FRAME-16, %sp
403	srln		$len, 4, $len
404	and		$inp, 7, $ialign
405	andn		$inp, 7, $inp
406	brz,pn		$len, .Lcbc_no_data
407	sll		$ialign, 3, $ileft
408
4091:	call		.+8
410	add		%o7, .Linp_align-1b, %o7
411
412	ld		[$key + 240], $rounds
413	and		$out, 7, $oalign
414	ld		[$ivp + 0], %f0		! load ivec
415	andn		$out, 7, $out
416	ld		[$ivp + 4], %f1
417	sll		$oalign, 3, $mask
418	ld		[$ivp + 8], %f2
419	ld		[$ivp + 12], %f3
420
421	sll		$rounds, 4, $rounds
422	add		$rounds, $key, $end
423	ldd		[$key + 0], $r0hi	! round[0]
424	ldd		[$key + 8], $r0lo
425
426	add		$inp, 16, $inp
427	sub		$len,  1, $len
428	ldd		[$end + 0], $rlhi	! round[last]
429	ldd		[$end + 8], $rllo
430
431	mov		16, $inc
432	movrz		$len, 0, $inc
433	ldd		[$key + 16], %f10	! round[1]
434	ldd		[$key + 24], %f12
435
436	ldd		[%o7 + $ileft], $fshift	! shift left params
437	add		%o7, 64, %o7
438	ldd		[$inp - 16], $in0	! load input
439	ldd		[$inp -  8], $in1
440	ldda		[$inp]0x82, $intail	! non-faulting load
441	brz		$dir, .Lcbc_decrypt
442	add		$inp, $inc, $inp	! inp+=16
443
444	fxor		$r0hi, %f0, %f0		! ivec^=round[0]
445	fxor		$r0lo, %f2, %f2
446	fshiftorx	$in0, $in1, $fshift, $in0
447	fshiftorx	$in1, $intail, $fshift, $in1
448	nop
449
450.Loop_cbc_enc:
451	fxor		$in0, %f0, %f0		! inp^ivec^round[0]
452	fxor		$in1, %f2, %f2
453	ldd		[$key + 32], %f6	! round[2]
454	ldd		[$key + 40], %f8
455	add		$key, 32, $end
456	sub		$rounds, 16*6, $inner
457
458.Lcbc_enc:
459	fmovd		%f0, %f4
460	faesencx	%f2, %f10, %f0
461	faesencx	%f4, %f12, %f2
462	ldd		[$end + 16], %f10
463	ldd		[$end + 24], %f12
464	add		$end, 32, $end
465
466	fmovd		%f0, %f4
467	faesencx	%f2, %f6, %f0
468	faesencx	%f4, %f8, %f2
469	ldd		[$end + 0], %f6
470	ldd		[$end + 8], %f8
471
472	brnz,a		$inner, .Lcbc_enc
473	sub		$inner, 16*2, $inner
474
475	fmovd		%f0, %f4
476	faesencx	%f2, %f10, %f0
477	faesencx	%f4, %f12, %f2
478	ldd		[$end + 16], %f10	! round[last-1]
479	ldd		[$end + 24], %f12
480
481	movrz		$len, 0, $inc
482	fmovd		$intail, $in0
483	ldd		[$inp - 8], $in1	! load next input block
484	ldda		[$inp]0x82, $intail	! non-faulting load
485	add		$inp, $inc, $inp	! inp+=16
486
487	fmovd		%f0, %f4
488	faesencx	%f2, %f6, %f0
489	faesencx	%f4, %f8, %f2
490
491	fshiftorx	$in0, $in1, $fshift, $in0
492	fshiftorx	$in1, $intail, $fshift, $in1
493
494	fmovd		%f0, %f4
495	faesencx	%f2, %f10, %f0
496	faesencx	%f4, %f12, %f2
497	ldd		[$key + 16], %f10	! round[1]
498	ldd		[$key + 24], %f12
499
500	fxor		$r0hi, $in0, $in0	! inp^=round[0]
501	fxor		$r0lo, $in1, $in1
502
503	fmovd		%f0, %f4
504	faesenclx	%f2, $rlhi, %f0
505	faesenclx	%f4, $rllo, %f2
506
507	brnz,pn		$oalign, .Lcbc_enc_unaligned_out
508	nop
509
510	std		%f0, [$out + 0]
511	std		%f2, [$out + 8]
512	add		$out, 16, $out
513
514	brnz,a		$len, .Loop_cbc_enc
515	sub		$len, 1, $len
516
517	st		%f0, [$ivp + 0]		! output ivec
518	st		%f1, [$ivp + 4]
519	st		%f2, [$ivp + 8]
520	st		%f3, [$ivp + 12]
521
522.Lcbc_no_data:
523	ret
524	restore
525
526.align	32
527.Lcbc_enc_unaligned_out:
528	ldd		[%o7 + $mask], $fshift	! shift right params
529	mov		0xff, $mask
530	srl		$mask, $oalign, $mask
531	sub		%g0, $ileft, $iright
532
533	fshiftorx	%f0, %f0, $fshift, %f6
534	fshiftorx	%f0, %f2, $fshift, %f8
535
536	stda		%f6, [$out + $mask]0xc0	! partial store
537	orn		%g0, $mask, $mask
538	std		%f8, [$out + 8]
539	add		$out, 16, $out
540	brz		$len, .Lcbc_enc_unaligned_out_done
541	sub		$len, 1, $len
542	b		.Loop_cbc_enc_unaligned_out
543	nop
544
545.align	32
546.Loop_cbc_enc_unaligned_out:
547	fmovd		%f2, $outhead
548	fxor		$in0, %f0, %f0		! inp^ivec^round[0]
549	fxor		$in1, %f2, %f2
550	ldd		[$key + 32], %f6	! round[2]
551	ldd		[$key + 40], %f8
552
553	fmovd		%f0, %f4
554	faesencx	%f2, %f10, %f0
555	faesencx	%f4, %f12, %f2
556	ldd		[$key + 48], %f10	! round[3]
557	ldd		[$key + 56], %f12
558
559	ldx		[$inp - 16], %o0
560	ldx		[$inp -  8], %o1
561	brz		$ileft, .Lcbc_enc_aligned_inp
562	movrz		$len, 0, $inc
563
564	ldx		[$inp], %o2
565	sllx		%o0, $ileft, %o0
566	srlx		%o1, $iright, %g1
567	sllx		%o1, $ileft, %o1
568	or		%g1, %o0, %o0
569	srlx		%o2, $iright, %o2
570	or		%o2, %o1, %o1
571
572.Lcbc_enc_aligned_inp:
573	fmovd		%f0, %f4
574	faesencx	%f2, %f6, %f0
575	faesencx	%f4, %f8, %f2
576	ldd		[$key + 64], %f6	! round[4]
577	ldd		[$key + 72], %f8
578	add		$key, 64, $end
579	sub		$rounds, 16*8, $inner
580
581	stx		%o0, [%sp + LOCALS + 0]
582	stx		%o1, [%sp + LOCALS + 8]
583	add		$inp, $inc, $inp	! inp+=16
584	nop
585
586.Lcbc_enc_unaligned:
587	fmovd		%f0, %f4
588	faesencx	%f2, %f10, %f0
589	faesencx	%f4, %f12, %f2
590	ldd		[$end + 16], %f10
591	ldd		[$end + 24], %f12
592	add		$end, 32, $end
593
594	fmovd		%f0, %f4
595	faesencx	%f2, %f6, %f0
596	faesencx	%f4, %f8, %f2
597	ldd		[$end + 0], %f6
598	ldd		[$end + 8], %f8
599
600	brnz,a		$inner, .Lcbc_enc_unaligned
601	sub		$inner, 16*2, $inner
602
603	fmovd		%f0, %f4
604	faesencx	%f2, %f10, %f0
605	faesencx	%f4, %f12, %f2
606	ldd		[$end + 16], %f10	! round[last-1]
607	ldd		[$end + 24], %f12
608
609	fmovd		%f0, %f4
610	faesencx	%f2, %f6, %f0
611	faesencx	%f4, %f8, %f2
612
613	ldd		[%sp + LOCALS + 0], $in0
614	ldd		[%sp + LOCALS + 8], $in1
615
616	fmovd		%f0, %f4
617	faesencx	%f2, %f10, %f0
618	faesencx	%f4, %f12, %f2
619	ldd		[$key + 16], %f10	! round[1]
620	ldd		[$key + 24], %f12
621
622	fxor		$r0hi, $in0, $in0	! inp^=round[0]
623	fxor		$r0lo, $in1, $in1
624
625	fmovd		%f0, %f4
626	faesenclx	%f2, $rlhi, %f0
627	faesenclx	%f4, $rllo, %f2
628
629	fshiftorx	$outhead, %f0, $fshift, %f6
630	fshiftorx	%f0, %f2, $fshift, %f8
631	std		%f6, [$out + 0]
632	std		%f8, [$out + 8]
633	add		$out, 16, $out
634
635	brnz,a		$len, .Loop_cbc_enc_unaligned_out
636	sub		$len, 1, $len
637
638.Lcbc_enc_unaligned_out_done:
639	fshiftorx	%f2, %f2, $fshift, %f8
640	stda		%f8, [$out + $mask]0xc0	! partial store
641
642	st		%f0, [$ivp + 0]		! output ivec
643	st		%f1, [$ivp + 4]
644	st		%f2, [$ivp + 8]
645	st		%f3, [$ivp + 12]
646
647	ret
648	restore
649
650.align	32
651.Lcbc_decrypt:
652	fshiftorx	$in0, $in1, $fshift, $in0
653	fshiftorx	$in1, $intail, $fshift, $in1
654	fmovd		%f0, $iv0
655	fmovd		%f2, $iv1
656
657.Loop_cbc_dec:
658	fxor		$in0, $r0hi, %f0	! inp^round[0]
659	fxor		$in1, $r0lo, %f2
660	ldd		[$key + 32], %f6	! round[2]
661	ldd		[$key + 40], %f8
662	add		$key, 32, $end
663	sub		$rounds, 16*6, $inner
664
665.Lcbc_dec:
666	fmovd		%f0, %f4
667	faesdecx	%f2, %f10, %f0
668	faesdecx	%f4, %f12, %f2
669	ldd		[$end + 16], %f10
670	ldd		[$end + 24], %f12
671	add		$end, 32, $end
672
673	fmovd		%f0, %f4
674	faesdecx	%f2, %f6, %f0
675	faesdecx	%f4, %f8, %f2
676	ldd		[$end + 0], %f6
677	ldd		[$end + 8], %f8
678
679	brnz,a		$inner, .Lcbc_dec
680	sub		$inner, 16*2, $inner
681
682	fmovd		%f0, %f4
683	faesdecx	%f2, %f10, %f0
684	faesdecx	%f4, %f12, %f2
685	ldd		[$end + 16], %f10	! round[last-1]
686	ldd		[$end + 24], %f12
687
688	fmovd		%f0, %f4
689	faesdecx	%f2, %f6, %f0
690	faesdecx	%f4, %f8, %f2
691	fxor		$iv0, $rlhi, %f6	! ivec^round[last]
692	fxor		$iv1, $rllo, %f8
693	fmovd		$in0, $iv0
694	fmovd		$in1, $iv1
695
696	movrz		$len, 0, $inc
697	fmovd		$intail, $in0
698	ldd		[$inp - 8], $in1	! load next input block
699	ldda		[$inp]0x82, $intail	! non-faulting load
700	add		$inp, $inc, $inp	! inp+=16
701
702	fmovd		%f0, %f4
703	faesdecx	%f2, %f10, %f0
704	faesdecx	%f4, %f12, %f2
705	ldd		[$key + 16], %f10	! round[1]
706	ldd		[$key + 24], %f12
707
708	fshiftorx	$in0, $in1, $fshift, $in0
709	fshiftorx	$in1, $intail, $fshift, $in1
710
711	fmovd		%f0, %f4
712	faesdeclx	%f2, %f6, %f0
713	faesdeclx	%f4, %f8, %f2
714
715	brnz,pn		$oalign, .Lcbc_dec_unaligned_out
716	nop
717
718	std		%f0, [$out + 0]
719	std		%f2, [$out + 8]
720	add		$out, 16, $out
721
722	brnz,a		$len, .Loop_cbc_dec
723	sub		$len, 1, $len
724
725	st		$iv0,    [$ivp + 0]	! output ivec
726	st		$iv0#lo, [$ivp + 4]
727	st		$iv1,    [$ivp + 8]
728	st		$iv1#lo, [$ivp + 12]
729
730	ret
731	restore
732
733.align	32
734.Lcbc_dec_unaligned_out:
735	ldd		[%o7 + $mask], $fshift	! shift right params
736	mov		0xff, $mask
737	srl		$mask, $oalign, $mask
738	sub		%g0, $ileft, $iright
739
740	fshiftorx	%f0, %f0, $fshift, %f6
741	fshiftorx	%f0, %f2, $fshift, %f8
742
743	stda		%f6, [$out + $mask]0xc0	! partial store
744	orn		%g0, $mask, $mask
745	std		%f8, [$out + 8]
746	add		$out, 16, $out
747	brz		$len, .Lcbc_dec_unaligned_out_done
748	sub		$len, 1, $len
749	b		.Loop_cbc_dec_unaligned_out
750	nop
751
752.align	32
753.Loop_cbc_dec_unaligned_out:
754	fmovd		%f2, $outhead
755	fxor		$in0, $r0hi, %f0	! inp^round[0]
756	fxor		$in1, $r0lo, %f2
757	ldd		[$key + 32], %f6	! round[2]
758	ldd		[$key + 40], %f8
759
760	fmovd		%f0, %f4
761	faesdecx	%f2, %f10, %f0
762	faesdecx	%f4, %f12, %f2
763	ldd		[$key + 48], %f10	! round[3]
764	ldd		[$key + 56], %f12
765
766	ldx		[$inp - 16], %o0
767	ldx		[$inp - 8], %o1
768	brz		$ileft, .Lcbc_dec_aligned_inp
769	movrz		$len, 0, $inc
770
771	ldx		[$inp], %o2
772	sllx		%o0, $ileft, %o0
773	srlx		%o1, $iright, %g1
774	sllx		%o1, $ileft, %o1
775	or		%g1, %o0, %o0
776	srlx		%o2, $iright, %o2
777	or		%o2, %o1, %o1
778
779.Lcbc_dec_aligned_inp:
780	fmovd		%f0, %f4
781	faesdecx	%f2, %f6, %f0
782	faesdecx	%f4, %f8, %f2
783	ldd		[$key + 64], %f6	! round[4]
784	ldd		[$key + 72], %f8
785	add		$key, 64, $end
786	sub		$rounds, 16*8, $inner
787
788	stx		%o0, [%sp + LOCALS + 0]
789	stx		%o1, [%sp + LOCALS + 8]
790	add		$inp, $inc, $inp	! inp+=16
791	nop
792
793.Lcbc_dec_unaligned:
794	fmovd		%f0, %f4
795	faesdecx	%f2, %f10, %f0
796	faesdecx	%f4, %f12, %f2
797	ldd		[$end + 16], %f10
798	ldd		[$end + 24], %f12
799	add		$end, 32, $end
800
801	fmovd		%f0, %f4
802	faesdecx	%f2, %f6, %f0
803	faesdecx	%f4, %f8, %f2
804	ldd		[$end + 0], %f6
805	ldd		[$end + 8], %f8
806
807	brnz,a		$inner, .Lcbc_dec_unaligned
808	sub		$inner, 16*2, $inner
809
810	fmovd		%f0, %f4
811	faesdecx	%f2, %f10, %f0
812	faesdecx	%f4, %f12, %f2
813	ldd		[$end + 16], %f10	! round[last-1]
814	ldd		[$end + 24], %f12
815
816	fmovd		%f0, %f4
817	faesdecx	%f2, %f6, %f0
818	faesdecx	%f4, %f8, %f2
819
820	fxor		$iv0, $rlhi, %f6	! ivec^round[last]
821	fxor		$iv1, $rllo, %f8
822	fmovd		$in0, $iv0
823	fmovd		$in1, $iv1
824	ldd		[%sp + LOCALS + 0], $in0
825	ldd		[%sp + LOCALS + 8], $in1
826
827	fmovd		%f0, %f4
828	faesdecx	%f2, %f10, %f0
829	faesdecx	%f4, %f12, %f2
830	ldd		[$key + 16], %f10	! round[1]
831	ldd		[$key + 24], %f12
832
833	fmovd		%f0, %f4
834	faesdeclx	%f2, %f6, %f0
835	faesdeclx	%f4, %f8, %f2
836
837	fshiftorx	$outhead, %f0, $fshift, %f6
838	fshiftorx	%f0, %f2, $fshift, %f8
839	std		%f6, [$out + 0]
840	std		%f8, [$out + 8]
841	add		$out, 16, $out
842
843	brnz,a		$len, .Loop_cbc_dec_unaligned_out
844	sub		$len, 1, $len
845
846.Lcbc_dec_unaligned_out_done:
847	fshiftorx	%f2, %f2, $fshift, %f8
848	stda		%f8, [$out + $mask]0xc0	! partial store
849
850	st		$iv0,    [$ivp + 0]	! output ivec
851	st		$iv0#lo, [$ivp + 4]
852	st		$iv1,    [$ivp + 8]
853	st		$iv1#lo, [$ivp + 12]
854
855	ret
856	restore
857.type	aes_fx_cbc_encrypt,#function
858.size	aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
859___
860}
# CTR mode: appends the assembly template for aes_fx_ctr32_encrypt_blocks
# to $code, followed by the .Linp_align/.Lout_align/.Lone data tables and
# the identification string.
#
# Register naming used by the template:
#   %i0..%i4  - named as $inp, $out, $len, $key, $ivp (%i5 is generated
#               but not bound to a name)
#   %l0..%l6  - scratch names ($rounds .. $mask); %l7 is not bound
#   even-numbered %f16, %f18, ... - named FP values ($ctr0 .. $fshift)
#   $one      - %f14, loaded in the template from [%o7 + 128]
#   $ileft/$iright reuse the registers named $ialign/$oalign
861{
862my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
863my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
864my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
865   = map("%f$_",grep { !($_ & 1) } (16 .. 62));
866my ($ileft,$iright) = ($ialign, $oalign);
867my $one = "%f14";
868
869$code.=<<___;
870.globl	aes_fx_ctr32_encrypt_blocks
871.align	32
872aes_fx_ctr32_encrypt_blocks:
873	save		%sp, -STACK_FRAME-16, %sp
874	srln		$len, 0, $len
875	and		$inp, 7, $ialign
876	andn		$inp, 7, $inp
877	brz,pn		$len, .Lctr32_no_data
878	sll		$ialign, 3, $ileft
879
880.Lpic:	call		.+8
881	add		%o7, .Linp_align - .Lpic, %o7
882
883	ld		[$key + 240], $rounds
884	and		$out, 7, $oalign
885	ld		[$ivp +  0], $ctr0	! load counter
886	andn		$out, 7, $out
887	ld		[$ivp +  4], $ctr0#lo
888	sll		$oalign, 3, $mask
889	ld		[$ivp +  8], $ctr1
890	ld		[$ivp + 12], $ctr1#lo
891	ldd		[%o7 + 128], $one
892
893	sll		$rounds, 4, $rounds
894	add		$rounds, $key, $end
895	ldd		[$key + 0], $r0hi	! round[0]
896	ldd		[$key + 8], $r0lo
897
898	add		$inp, 16, $inp
899	sub		$len, 1, $len
900	ldd		[$key + 16], %f10	! round[1]
901	ldd		[$key + 24], %f12
902
903	mov		16, $inc
904	movrz		$len, 0, $inc
905	ldd		[$end + 0], $rlhi	! round[last]
906	ldd		[$end + 8], $rllo
907
908	ldd		[%o7 + $ileft], $fshift	! shiftleft params
909	add		%o7, 64, %o7
910	ldd		[$inp - 16], $in0	! load input
911	ldd		[$inp -  8], $in1
912	ldda		[$inp]0x82, $intail	! non-faulting load
913	add		$inp, $inc, $inp	! inp+=16
914
915	fshiftorx	$in0, $in1, $fshift, $in0
916	fshiftorx	$in1, $intail, $fshift, $in1
917
918.Loop_ctr32:
919	fxor		$ctr0, $r0hi, %f0	! counter^round[0]
920	fxor		$ctr1, $r0lo, %f2
921	ldd		[$key + 32], %f6	! round[2]
922	ldd		[$key + 40], %f8
923	add		$key, 32, $end
924	sub		$rounds, 16*6, $inner
925
926.Lctr32_enc:
927	fmovd		%f0, %f4
928	faesencx	%f2, %f10, %f0
929	faesencx	%f4, %f12, %f2
930	ldd		[$end + 16], %f10
931	ldd		[$end + 24], %f12
932	add		$end, 32, $end
933
934	fmovd		%f0, %f4
935	faesencx	%f2, %f6, %f0
936	faesencx	%f4, %f8, %f2
937	ldd		[$end + 0], %f6
938	ldd		[$end + 8], %f8
939
940	brnz,a		$inner, .Lctr32_enc
941	sub		$inner, 16*2, $inner
942
943	fmovd		%f0, %f4
944	faesencx	%f2, %f10, %f0
945	faesencx	%f4, %f12, %f2
946	ldd		[$end + 16], %f10	! round[last-1]
947	ldd		[$end + 24], %f12
948
949	fmovd		%f0, %f4
950	faesencx	%f2, %f6, %f0
951	faesencx	%f4, %f8, %f2
952	fxor		$in0, $rlhi, %f6	! inp^round[last]
953	fxor		$in1, $rllo, %f8
954
955	movrz		$len, 0, $inc
956	fmovd		$intail, $in0
957	ldd		[$inp - 8], $in1	! load next input block
958	ldda		[$inp]0x82, $intail	! non-faulting load
959	add		$inp, $inc, $inp	! inp+=16
960
961	fmovd		%f0, %f4
962	faesencx	%f2, %f10, %f0
963	faesencx	%f4, %f12, %f2
964	ldd		[$key + 16], %f10	! round[1]
965	ldd		[$key + 24], %f12
966
967	fshiftorx	$in0, $in1, $fshift, $in0
968	fshiftorx	$in1, $intail, $fshift, $in1
969	fpadd32		$ctr1, $one, $ctr1	! increment counter
970
971	fmovd		%f0, %f4
972	faesenclx	%f2, %f6, %f0
973	faesenclx	%f4, %f8, %f2
974
975	brnz,pn		$oalign, .Lctr32_unaligned_out
976	nop
977
978	std		%f0, [$out + 0]
979	std		%f2, [$out + 8]
980	add		$out, 16, $out
981
982	brnz,a		$len, .Loop_ctr32
983	sub		$len, 1, $len
984
985.Lctr32_no_data:
986	ret
987	restore
988
989.align	32
990.Lctr32_unaligned_out:
991	ldd		[%o7 + $mask], $fshift	! shift right params
992	mov		0xff, $mask
993	srl		$mask, $oalign, $mask
994	sub		%g0, $ileft, $iright
995
996	fshiftorx	%f0, %f0, $fshift, %f6
997	fshiftorx	%f0, %f2, $fshift, %f8
998
999	stda		%f6, [$out + $mask]0xc0	! partial store
1000	orn		%g0, $mask, $mask
1001	std		%f8, [$out + 8]
1002	add		$out, 16, $out
1003	brz		$len, .Lctr32_unaligned_out_done
1004	sub		$len, 1, $len
1005	b		.Loop_ctr32_unaligned_out
1006	nop
1007
1008.align	32
1009.Loop_ctr32_unaligned_out:
1010	fmovd		%f2, $outhead
1011	fxor		$ctr0, $r0hi, %f0	! counter^round[0]
1012	fxor		$ctr1, $r0lo, %f2
1013	ldd		[$key + 32], %f6	! round[2]
1014	ldd		[$key + 40], %f8
1015
1016	fmovd		%f0, %f4
1017	faesencx	%f2, %f10, %f0
1018	faesencx	%f4, %f12, %f2
1019	ldd		[$key + 48], %f10	! round[3]
1020	ldd		[$key + 56], %f12
1021
1022	ldx		[$inp - 16], %o0
1023	ldx		[$inp -  8], %o1
1024	brz		$ileft, .Lctr32_aligned_inp
1025	movrz		$len, 0, $inc
1026
1027	ldx		[$inp], %o2
1028	sllx		%o0, $ileft, %o0
1029	srlx		%o1, $iright, %g1
1030	sllx		%o1, $ileft, %o1
1031	or		%g1, %o0, %o0
1032	srlx		%o2, $iright, %o2
1033	or		%o2, %o1, %o1
1034
1035.Lctr32_aligned_inp:
1036	fmovd		%f0, %f4
1037	faesencx	%f2, %f6, %f0
1038	faesencx	%f4, %f8, %f2
1039	ldd		[$key + 64], %f6	! round[4]
1040	ldd		[$key + 72], %f8
1041	add		$key, 64, $end
1042	sub		$rounds, 16*8, $inner
1043
1044	stx		%o0, [%sp + LOCALS + 0]
1045	stx		%o1, [%sp + LOCALS + 8]
1046	add		$inp, $inc, $inp	! inp+=16
1047	nop
1048
1049.Lctr32_enc_unaligned:
1050	fmovd		%f0, %f4
1051	faesencx	%f2, %f10, %f0
1052	faesencx	%f4, %f12, %f2
1053	ldd		[$end + 16], %f10
1054	ldd		[$end + 24], %f12
1055	add		$end, 32, $end
1056
1057	fmovd		%f0, %f4
1058	faesencx	%f2, %f6, %f0
1059	faesencx	%f4, %f8, %f2
1060	ldd		[$end + 0], %f6
1061	ldd		[$end + 8], %f8
1062
1063	brnz,a		$inner, .Lctr32_enc_unaligned
1064	sub		$inner, 16*2, $inner
1065
1066	fmovd		%f0, %f4
1067	faesencx	%f2, %f10, %f0
1068	faesencx	%f4, %f12, %f2
1069	ldd		[$end + 16], %f10	! round[last-1]
1070	ldd		[$end + 24], %f12
1071	fpadd32		$ctr1, $one, $ctr1	! increment counter
1072
1073	fmovd		%f0, %f4
1074	faesencx	%f2, %f6, %f0
1075	faesencx	%f4, %f8, %f2
1076	fxor		$in0, $rlhi, %f6	! inp^round[last]
1077	fxor		$in1, $rllo, %f8
1078	ldd		[%sp + LOCALS + 0], $in0
1079	ldd		[%sp + LOCALS + 8], $in1
1080
1081	fmovd		%f0, %f4
1082	faesencx	%f2, %f10, %f0
1083	faesencx	%f4, %f12, %f2
1084	ldd		[$key + 16], %f10	! round[1]
1085	ldd		[$key + 24], %f12
1086
1087	fmovd		%f0, %f4
1088	faesenclx	%f2, %f6, %f0
1089	faesenclx	%f4, %f8, %f2
1090
1091	fshiftorx	$outhead, %f0, $fshift, %f6
1092	fshiftorx	%f0, %f2, $fshift, %f8
1093	std		%f6, [$out + 0]
1094	std		%f8, [$out + 8]
1095	add		$out, 16, $out
1096
1097	brnz,a		$len, .Loop_ctr32_unaligned_out
1098	sub		$len, 1, $len
1099
1100.Lctr32_unaligned_out_done:
1101	fshiftorx	%f2, %f2, $fshift, %f8
1102	stda		%f8, [$out + $mask]0xc0	! partial store
1103
1104	ret
1105	restore
1106.type	aes_fx_ctr32_encrypt_blocks,#function
1107.size	aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
1108
1109.align	32
1110.Linp_align:		! fshiftorx parameters for left shift toward %rs1
1111	.byte	0, 0, 64,  0,	0, 64,  0, -64
1112	.byte	0, 0, 56,  8,	0, 56,  8, -56
1113	.byte	0, 0, 48, 16,	0, 48, 16, -48
1114	.byte	0, 0, 40, 24,	0, 40, 24, -40
1115	.byte	0, 0, 32, 32,	0, 32, 32, -32
1116	.byte	0, 0, 24, 40,	0, 24, 40, -24
1117	.byte	0, 0, 16, 48,	0, 16, 48, -16
1118	.byte	0, 0,  8, 56,	0,  8, 56, -8
1119.Lout_align:		! fshiftorx parameters for right shift toward %rs2
1120	.byte	0, 0,  0, 64,	0,  0, 64,   0
1121	.byte	0, 0,  8, 56,	0,  8, 56,  -8
1122	.byte	0, 0, 16, 48,	0, 16, 48, -16
1123	.byte	0, 0, 24, 40,	0, 24, 40, -24
1124	.byte	0, 0, 32, 32,	0, 32, 32, -32
1125	.byte	0, 0, 40, 24,	0, 40, 24, -40
1126	.byte	0, 0, 48, 16,	0, 48, 16, -48
1127	.byte	0, 0, 56,  8,	0, 56,  8, -56
1128.Lone:
1129	.word	0, 1
1130.asciz	"AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
1131.align	4
1132___
1133}
# Purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary
# and let the programmer detect at run-time whether the current CPU is
# VIS capable.
#
# unvis($mnemonic,$rs1,$rs2,$rd)
#   Returns ".word\t0x........ !<original text>" for the three-operand
#   floating-point instructions listed in %visopf.  In every other case
#   the instruction is returned unencoded as "mnemonic\trs1,rs2,rd" so
#   that the assembler gets to see (and diagnose) it: unknown mnemonic,
#   an operand that is not exactly %fN, a register number above 62, or
#   an odd-numbered register at or above %f32.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fpadd32"	=> 0x052,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# $_ aliases each operand in turn, so assigning to it replaces
	# the textual register name with its 5-bit field value.
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/\A%f([0-9]{1,2})\z/);
	    $_=$1;
	    # there is no register above %f62; without this check an
	    # even number such as 64 would be folded onto another register
	    return $ref if ($_>62);
	    if ($_>=32) {
		return $ref if ($_&1);
		# re-encode for upper double register addressing
		$_=($_|$_>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
1168
# unvis3($mnemonic,$rs1,$rs2,$rd)
#   Integer-register counterpart of the VIS encoder: returns
#   ".word\t0x........ !<original text>" for the mnemonics listed in
#   %visopf.  Operands must be exactly %g, %o, %l or %i followed by a
#   digit 0-7; each is mapped to bank bias + index.  Anything else is
#   returned unencoded as "mnemonic\trs1,rs2,rd".
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	# $_ aliases each operand in turn, so assigning to it replaces
	# the textual register name with its 5-bit field value.
	foreach ($rs1,$rs2,$rd) {
	    # each bank holds eight registers; an index of 8 or 9 would
	    # otherwise spill over into the next bank's numbering
	    return $ref if (!/\A%([goli])([0-7])\z/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
1192
# unfx($mnemonic,$rs1,$rs2,$rd)
#   Encodes the faes* instructions listed in %aesopf as
#   ".word\t0x........ !<original text>".
#     $rs1, $rd - %fN registers
#     $rs2      - either an even-numbered %fN register or a small
#                 immediate (decimal, or 0x.../0... as understood by oct)
#   Whenever an operand cannot be encoded faithfully -- unknown
#   mnemonic, malformed operand, odd or out-of-range register in $rs2,
#   register number above 62, immediate that does not fit the 5-bit
#   field -- the instruction is returned unencoded as
#   "mnemonic\trs1,rs2,rd" so that the assembler gets to diagnose it.
sub unfx {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"faesencx"	=> 0x90,
		"faesdecx"	=> 0x91,
		"faesenclx"	=> 0x92,
		"faesdeclx"	=> 0x93,
		"faeskeyx"	=> 0x94	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	if ($rs2 =~ /\A%f([0-9]{1,2})\z/) {
	    # register form: the whole number is examined, so an odd
	    # register can no longer be mistaken for its leading digit
	    return $ref if ($1&1 || $1>62);
	    $rs2 = ($1|$1>>5)&31;
	} elsif ($rs2 =~ /\A(?:0[xX][0-9a-fA-F]+|[0-9]+)\z/) {
	    # immediate form: a leading 0 selects hex/octal notation
	    $rs2 = oct($rs2) if ($rs2 =~ /^0/);
	    # the field occupies the five bits below $opf<<5
	    return $ref if ($rs2>31);
	} else {
	    return $ref;
	}

	# $_ aliases each operand in turn, so assigning to it replaces
	# the textual register name with its 5-bit field value.
	foreach ($rs1,$rd) {
	    return $ref if (!/\A%f([0-9]{1,2})\z/);
	    $_=$1;
	    # there is no register above %f62
	    return $ref if ($_>62);
	    if ($_>=32) {
		return $ref if ($_&1);
		# re-encode for upper double register addressing
		$_=($_|$_>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
1225
# unfx3src($mnemonic,$rs1,$rs2,$rs3,$rd)
#   Encodes the three-source instruction(s) listed in %aesopf
#   (fshiftorx) as ".word\t0x........ !<original text>".  All four
#   operands must be exactly %fN.  Unknown mnemonics, malformed
#   operands, register numbers above 62 and odd-numbered registers at
#   or above %f32 cause the instruction to be returned unencoded as
#   "mnemonic\trs1,rs2,rs3,rd".
sub unfx3src {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"fshiftorx"	=> 0x0b	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	# $_ aliases each operand in turn, so assigning to it replaces
	# the textual register name with its 5-bit field value.
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/\A%f([0-9]{1,2})\z/);
	    $_=$1;
	    # there is no register above %f62; without this check an
	    # even number such as 64 would be folded onto another register
	    return $ref if ($_>62);
	    if ($_>=32) {
		return $ref if ($_&1);
		# re-encode for upper double register addressing
		$_=($_|$_>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
1251
# Output pass: walk the accumulated template line by line, expand the
# Perl-level shorthands, hand encodable instructions to the un* helpers
# and print the result.
for my $line (split("\n",$code)) {
    # `...` spans are Perl expressions; replace them with their value
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    # %fN#lo stands for the register numbered one above %fN
    $line =~ s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    # The encoders are tried in a fixed order and the first pattern that
    # substitutes anything ends the chain for this line.
    $line =~ s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/unfx($1,$2,$3,$4)/ge
	or
    $line =~ s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/unfx3src($1,$2,$3,$4,$5)/ge
	or
    $line =~ s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/unvis($1,$2,$3,$4)/ge
	or
    $line =~ s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unvis3($1,$2,$3,$4)/ge;

    print $line,"\n";
}

# Buffered write errors only surface here, hence the check.
close STDOUT or die "error closing STDOUT: $!";
1273