• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# April 2010
18#
19# The module implements "4-bit" GCM GHASH function and underlying
20# single multiplication operation in GF(2^128). "4-bit" means that it
21# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
22# it processes one byte in 19.6 cycles, which is more than twice as
23# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
24# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
25# processed byte. This is ~2.2x faster than 64-bit code generated by
26# vendor compiler (which used to be very hard to beat:-).
27#
28# Special thanks to polarhome.com for providing HP-UX account.
29
30$flavour = shift;
31$output = shift;
32open STDOUT,">$output";
33
34if ($flavour =~ /64/) {
35	$LEVEL		="2.0W";
36	$SIZE_T		=8;
37	$FRAME_MARKER	=80;
38	$SAVED_RP	=16;
39	$PUSH		="std";
40	$PUSHMA		="std,ma";
41	$POP		="ldd";
42	$POPMB		="ldd,mb";
43	$NREGS		=6;
44} else {
45	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
46	$SIZE_T		=4;
47	$FRAME_MARKER	=48;
48	$SAVED_RP	=20;
49	$PUSH		="stw";
50	$PUSHMA		="stwm";
51	$POP		="ldw";
52	$POPMB		="ldwm";
53	$NREGS		=11;
54}
55
56$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
57				#                 [+ argument transfer]
58
59################# volatile registers
60$Xi="%r26";	# argument block
61$Htbl="%r25";
62$inp="%r24";
63$len="%r23";
64$Hhh=$Htbl;	# variables
65$Hll="%r22";
66$Zhh="%r21";
67$Zll="%r20";
68$cnt="%r19";
69$rem_4bit="%r28";
70$rem="%r29";
71$mask0xf0="%r31";
72
73################# preserved registers
74$Thh="%r1";
75$Tll="%r2";
76$nlo="%r3";
77$nhi="%r4";
78$byte="%r5";
79if ($SIZE_T==4) {
80	$Zhl="%r6";
81	$Zlh="%r7";
82	$Hhl="%r8";
83	$Hlh="%r9";
84	$Thl="%r10";
85	$Tlh="%r11";
86}
87$rem2="%r6";	# used in PA-RISC 2.0 code
88
89$code.=<<___;
90	.LEVEL	$LEVEL
91	.SPACE	\$TEXT\$
92	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
93
94	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
95	.ALIGN	64
96gcm_gmult_4bit
97	.PROC
98	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
99	.ENTRY
100	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
101	$PUSHMA	%r3,$FRAME(%sp)
102	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
103	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
104	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
105___
106$code.=<<___ if ($SIZE_T==4);
107	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
108	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
109	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
110	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
111	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
112___
113$code.=<<___;
114	blr	%r0,$rem_4bit
115	ldi	3,$rem
116L\$pic_gmult
117	andcm	$rem_4bit,$rem,$rem_4bit
118	addl	$inp,$len,$len
119	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
120	ldi	0xf0,$mask0xf0
121___
122$code.=<<___ if ($SIZE_T==4);
123	ldi	31,$rem
124	mtctl	$rem,%cr11
125	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
126	b	L\$parisc1_gmult
127	nop
128___
129
130$code.=<<___;
131	ldb	15($Xi),$nlo
132	ldo	8($Htbl),$Hll
133
134	and	$mask0xf0,$nlo,$nhi
135	depd,z	$nlo,59,4,$nlo
136
137	ldd	$nlo($Hll),$Zll
138	ldd	$nlo($Hhh),$Zhh
139
140	depd,z	$Zll,60,4,$rem
141	shrpd	$Zhh,$Zll,4,$Zll
142	extrd,u	$Zhh,59,60,$Zhh
143	ldb	14($Xi),$nlo
144
145	ldd	$nhi($Hll),$Tll
146	ldd	$nhi($Hhh),$Thh
147	and	$mask0xf0,$nlo,$nhi
148	depd,z	$nlo,59,4,$nlo
149
150	xor	$Tll,$Zll,$Zll
151	xor	$Thh,$Zhh,$Zhh
152	ldd	$rem($rem_4bit),$rem
153	b	L\$oop_gmult_pa2
154	ldi	13,$cnt
155
156	.ALIGN	8
157L\$oop_gmult_pa2
158	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
159	depd,z	$Zll,60,4,$rem
160
161	shrpd	$Zhh,$Zll,4,$Zll
162	extrd,u	$Zhh,59,60,$Zhh
163	ldd	$nlo($Hll),$Tll
164	ldd	$nlo($Hhh),$Thh
165
166	xor	$Tll,$Zll,$Zll
167	xor	$Thh,$Zhh,$Zhh
168	ldd	$rem($rem_4bit),$rem
169
170	xor	$rem,$Zhh,$Zhh
171	depd,z	$Zll,60,4,$rem
172	ldbx	$cnt($Xi),$nlo
173
174	shrpd	$Zhh,$Zll,4,$Zll
175	extrd,u	$Zhh,59,60,$Zhh
176	ldd	$nhi($Hll),$Tll
177	ldd	$nhi($Hhh),$Thh
178
179	and	$mask0xf0,$nlo,$nhi
180	depd,z	$nlo,59,4,$nlo
181	ldd	$rem($rem_4bit),$rem
182
183	xor	$Tll,$Zll,$Zll
184	addib,uv -1,$cnt,L\$oop_gmult_pa2
185	xor	$Thh,$Zhh,$Zhh
186
187	xor	$rem,$Zhh,$Zhh
188	depd,z	$Zll,60,4,$rem
189
190	shrpd	$Zhh,$Zll,4,$Zll
191	extrd,u	$Zhh,59,60,$Zhh
192	ldd	$nlo($Hll),$Tll
193	ldd	$nlo($Hhh),$Thh
194
195	xor	$Tll,$Zll,$Zll
196	xor	$Thh,$Zhh,$Zhh
197	ldd	$rem($rem_4bit),$rem
198
199	xor	$rem,$Zhh,$Zhh
200	depd,z	$Zll,60,4,$rem
201
202	shrpd	$Zhh,$Zll,4,$Zll
203	extrd,u	$Zhh,59,60,$Zhh
204	ldd	$nhi($Hll),$Tll
205	ldd	$nhi($Hhh),$Thh
206
207	xor	$Tll,$Zll,$Zll
208	xor	$Thh,$Zhh,$Zhh
209	ldd	$rem($rem_4bit),$rem
210
211	xor	$rem,$Zhh,$Zhh
212	std	$Zll,8($Xi)
213	std	$Zhh,0($Xi)
214___
215
216$code.=<<___ if ($SIZE_T==4);
217	b	L\$done_gmult
218	nop
219
220L\$parisc1_gmult
221	ldb	15($Xi),$nlo
222	ldo	12($Htbl),$Hll
223	ldo	8($Htbl),$Hlh
224	ldo	4($Htbl),$Hhl
225
226	and	$mask0xf0,$nlo,$nhi
227	zdep	$nlo,27,4,$nlo
228
229	ldwx	$nlo($Hll),$Zll
230	ldwx	$nlo($Hlh),$Zlh
231	ldwx	$nlo($Hhl),$Zhl
232	ldwx	$nlo($Hhh),$Zhh
233	zdep	$Zll,28,4,$rem
234	ldb	14($Xi),$nlo
235	ldwx	$rem($rem_4bit),$rem
236	shrpw	$Zlh,$Zll,4,$Zll
237	ldwx	$nhi($Hll),$Tll
238	shrpw	$Zhl,$Zlh,4,$Zlh
239	ldwx	$nhi($Hlh),$Tlh
240	shrpw	$Zhh,$Zhl,4,$Zhl
241	ldwx	$nhi($Hhl),$Thl
242	extru	$Zhh,27,28,$Zhh
243	ldwx	$nhi($Hhh),$Thh
244	xor	$rem,$Zhh,$Zhh
245	and	$mask0xf0,$nlo,$nhi
246	zdep	$nlo,27,4,$nlo
247
248	xor	$Tll,$Zll,$Zll
249	ldwx	$nlo($Hll),$Tll
250	xor	$Tlh,$Zlh,$Zlh
251	ldwx	$nlo($Hlh),$Tlh
252	xor	$Thl,$Zhl,$Zhl
253	b	L\$oop_gmult_pa1
254	ldi	13,$cnt
255
256	.ALIGN	8
257L\$oop_gmult_pa1
258	zdep	$Zll,28,4,$rem
259	ldwx	$nlo($Hhl),$Thl
260	xor	$Thh,$Zhh,$Zhh
261	ldwx	$rem($rem_4bit),$rem
262	shrpw	$Zlh,$Zll,4,$Zll
263	ldwx	$nlo($Hhh),$Thh
264	shrpw	$Zhl,$Zlh,4,$Zlh
265	ldbx	$cnt($Xi),$nlo
266	xor	$Tll,$Zll,$Zll
267	ldwx	$nhi($Hll),$Tll
268	shrpw	$Zhh,$Zhl,4,$Zhl
269	xor	$Tlh,$Zlh,$Zlh
270	ldwx	$nhi($Hlh),$Tlh
271	extru	$Zhh,27,28,$Zhh
272	xor	$Thl,$Zhl,$Zhl
273	ldwx	$nhi($Hhl),$Thl
274	xor	$rem,$Zhh,$Zhh
275	zdep	$Zll,28,4,$rem
276	xor	$Thh,$Zhh,$Zhh
277	ldwx	$nhi($Hhh),$Thh
278	shrpw	$Zlh,$Zll,4,$Zll
279	ldwx	$rem($rem_4bit),$rem
280	shrpw	$Zhl,$Zlh,4,$Zlh
281	shrpw	$Zhh,$Zhl,4,$Zhl
282	and	$mask0xf0,$nlo,$nhi
283	extru	$Zhh,27,28,$Zhh
284	zdep	$nlo,27,4,$nlo
285	xor	$Tll,$Zll,$Zll
286	ldwx	$nlo($Hll),$Tll
287	xor	$Tlh,$Zlh,$Zlh
288	ldwx	$nlo($Hlh),$Tlh
289	xor	$rem,$Zhh,$Zhh
290	addib,uv -1,$cnt,L\$oop_gmult_pa1
291	xor	$Thl,$Zhl,$Zhl
292
293	zdep	$Zll,28,4,$rem
294	ldwx	$nlo($Hhl),$Thl
295	xor	$Thh,$Zhh,$Zhh
296	ldwx	$rem($rem_4bit),$rem
297	shrpw	$Zlh,$Zll,4,$Zll
298	ldwx	$nlo($Hhh),$Thh
299	shrpw	$Zhl,$Zlh,4,$Zlh
300	xor	$Tll,$Zll,$Zll
301	ldwx	$nhi($Hll),$Tll
302	shrpw	$Zhh,$Zhl,4,$Zhl
303	xor	$Tlh,$Zlh,$Zlh
304	ldwx	$nhi($Hlh),$Tlh
305	extru	$Zhh,27,28,$Zhh
306	xor	$rem,$Zhh,$Zhh
307	xor	$Thl,$Zhl,$Zhl
308	ldwx	$nhi($Hhl),$Thl
309	xor	$Thh,$Zhh,$Zhh
310	ldwx	$nhi($Hhh),$Thh
311	zdep	$Zll,28,4,$rem
312	ldwx	$rem($rem_4bit),$rem
313	shrpw	$Zlh,$Zll,4,$Zll
314	shrpw	$Zhl,$Zlh,4,$Zlh
315	shrpw	$Zhh,$Zhl,4,$Zhl
316	extru	$Zhh,27,28,$Zhh
317	xor	$Tll,$Zll,$Zll
318	xor	$Tlh,$Zlh,$Zlh
319	xor	$rem,$Zhh,$Zhh
320	stw	$Zll,12($Xi)
321	xor	$Thl,$Zhl,$Zhl
322	stw	$Zlh,8($Xi)
323	xor	$Thh,$Zhh,$Zhh
324	stw	$Zhl,4($Xi)
325	stw	$Zhh,0($Xi)
326___
327$code.=<<___;
328L\$done_gmult
329	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
330	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
331	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
332	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
333___
334$code.=<<___ if ($SIZE_T==4);
335	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
336	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
337	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
338	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
339	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
340___
341$code.=<<___;
342	bv	(%r2)
343	.EXIT
344	$POPMB	-$FRAME(%sp),%r3
345	.PROCEND
346
347	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
348	.ALIGN	64
349gcm_ghash_4bit
350	.PROC
351	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
352	.ENTRY
353	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
354	$PUSHMA	%r3,$FRAME(%sp)
355	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
356	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
357	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
358___
359$code.=<<___ if ($SIZE_T==4);
360	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
361	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
362	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
363	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
364	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
365___
366$code.=<<___;
367	blr	%r0,$rem_4bit
368	ldi	3,$rem
369L\$pic_ghash
370	andcm	$rem_4bit,$rem,$rem_4bit
371	addl	$inp,$len,$len
372	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
373	ldi	0xf0,$mask0xf0
374___
375$code.=<<___ if ($SIZE_T==4);
376	ldi	31,$rem
377	mtctl	$rem,%cr11
378	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
379	b	L\$parisc1_ghash
380	nop
381___
382
383$code.=<<___;
384	ldb	15($Xi),$nlo
385	ldo	8($Htbl),$Hll
386
387L\$outer_ghash_pa2
388	ldb	15($inp),$nhi
389	xor	$nhi,$nlo,$nlo
390	and	$mask0xf0,$nlo,$nhi
391	depd,z	$nlo,59,4,$nlo
392
393	ldd	$nlo($Hll),$Zll
394	ldd	$nlo($Hhh),$Zhh
395
396	depd,z	$Zll,60,4,$rem
397	shrpd	$Zhh,$Zll,4,$Zll
398	extrd,u	$Zhh,59,60,$Zhh
399	ldb	14($Xi),$nlo
400	ldb	14($inp),$byte
401
402	ldd	$nhi($Hll),$Tll
403	ldd	$nhi($Hhh),$Thh
404	xor	$byte,$nlo,$nlo
405	and	$mask0xf0,$nlo,$nhi
406	depd,z	$nlo,59,4,$nlo
407
408	xor	$Tll,$Zll,$Zll
409	xor	$Thh,$Zhh,$Zhh
410	ldd	$rem($rem_4bit),$rem
411	b	L\$oop_ghash_pa2
412	ldi	13,$cnt
413
414	.ALIGN	8
415L\$oop_ghash_pa2
416	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
417	depd,z	$Zll,60,4,$rem2
418
419	shrpd	$Zhh,$Zll,4,$Zll
420	extrd,u	$Zhh,59,60,$Zhh
421	ldd	$nlo($Hll),$Tll
422	ldd	$nlo($Hhh),$Thh
423
424	xor	$Tll,$Zll,$Zll
425	xor	$Thh,$Zhh,$Zhh
426	ldbx	$cnt($Xi),$nlo
427	ldbx	$cnt($inp),$byte
428
429	depd,z	$Zll,60,4,$rem
430	shrpd	$Zhh,$Zll,4,$Zll
431	ldd	$rem2($rem_4bit),$rem2
432
433	xor	$rem2,$Zhh,$Zhh
434	xor	$byte,$nlo,$nlo
435	ldd	$nhi($Hll),$Tll
436	ldd	$nhi($Hhh),$Thh
437
438	and	$mask0xf0,$nlo,$nhi
439	depd,z	$nlo,59,4,$nlo
440
441	extrd,u	$Zhh,59,60,$Zhh
442	xor	$Tll,$Zll,$Zll
443
444	ldd	$rem($rem_4bit),$rem
445	addib,uv -1,$cnt,L\$oop_ghash_pa2
446	xor	$Thh,$Zhh,$Zhh
447
448	xor	$rem,$Zhh,$Zhh
449	depd,z	$Zll,60,4,$rem2
450
451	shrpd	$Zhh,$Zll,4,$Zll
452	extrd,u	$Zhh,59,60,$Zhh
453	ldd	$nlo($Hll),$Tll
454	ldd	$nlo($Hhh),$Thh
455
456	xor	$Tll,$Zll,$Zll
457	xor	$Thh,$Zhh,$Zhh
458
459	depd,z	$Zll,60,4,$rem
460	shrpd	$Zhh,$Zll,4,$Zll
461	ldd	$rem2($rem_4bit),$rem2
462
463	xor	$rem2,$Zhh,$Zhh
464	ldd	$nhi($Hll),$Tll
465	ldd	$nhi($Hhh),$Thh
466
467	extrd,u	$Zhh,59,60,$Zhh
468	xor	$Tll,$Zll,$Zll
469	xor	$Thh,$Zhh,$Zhh
470	ldd	$rem($rem_4bit),$rem
471
472	xor	$rem,$Zhh,$Zhh
473	std	$Zll,8($Xi)
474	ldo	16($inp),$inp
475	std	$Zhh,0($Xi)
476	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
477	copy	$Zll,$nlo
478___
479
480$code.=<<___ if ($SIZE_T==4);
481	b	L\$done_ghash
482	nop
483
484L\$parisc1_ghash
485	ldb	15($Xi),$nlo
486	ldo	12($Htbl),$Hll
487	ldo	8($Htbl),$Hlh
488	ldo	4($Htbl),$Hhl
489
490L\$outer_ghash_pa1
491	ldb	15($inp),$byte
492	xor	$byte,$nlo,$nlo
493	and	$mask0xf0,$nlo,$nhi
494	zdep	$nlo,27,4,$nlo
495
496	ldwx	$nlo($Hll),$Zll
497	ldwx	$nlo($Hlh),$Zlh
498	ldwx	$nlo($Hhl),$Zhl
499	ldwx	$nlo($Hhh),$Zhh
500	zdep	$Zll,28,4,$rem
501	ldb	14($Xi),$nlo
502	ldb	14($inp),$byte
503	ldwx	$rem($rem_4bit),$rem
504	shrpw	$Zlh,$Zll,4,$Zll
505	ldwx	$nhi($Hll),$Tll
506	shrpw	$Zhl,$Zlh,4,$Zlh
507	ldwx	$nhi($Hlh),$Tlh
508	shrpw	$Zhh,$Zhl,4,$Zhl
509	ldwx	$nhi($Hhl),$Thl
510	extru	$Zhh,27,28,$Zhh
511	ldwx	$nhi($Hhh),$Thh
512	xor	$byte,$nlo,$nlo
513	xor	$rem,$Zhh,$Zhh
514	and	$mask0xf0,$nlo,$nhi
515	zdep	$nlo,27,4,$nlo
516
517	xor	$Tll,$Zll,$Zll
518	ldwx	$nlo($Hll),$Tll
519	xor	$Tlh,$Zlh,$Zlh
520	ldwx	$nlo($Hlh),$Tlh
521	xor	$Thl,$Zhl,$Zhl
522	b	L\$oop_ghash_pa1
523	ldi	13,$cnt
524
525	.ALIGN	8
526L\$oop_ghash_pa1
527	zdep	$Zll,28,4,$rem
528	ldwx	$nlo($Hhl),$Thl
529	xor	$Thh,$Zhh,$Zhh
530	ldwx	$rem($rem_4bit),$rem
531	shrpw	$Zlh,$Zll,4,$Zll
532	ldwx	$nlo($Hhh),$Thh
533	shrpw	$Zhl,$Zlh,4,$Zlh
534	ldbx	$cnt($Xi),$nlo
535	xor	$Tll,$Zll,$Zll
536	ldwx	$nhi($Hll),$Tll
537	shrpw	$Zhh,$Zhl,4,$Zhl
538	ldbx	$cnt($inp),$byte
539	xor	$Tlh,$Zlh,$Zlh
540	ldwx	$nhi($Hlh),$Tlh
541	extru	$Zhh,27,28,$Zhh
542	xor	$Thl,$Zhl,$Zhl
543	ldwx	$nhi($Hhl),$Thl
544	xor	$rem,$Zhh,$Zhh
545	zdep	$Zll,28,4,$rem
546	xor	$Thh,$Zhh,$Zhh
547	ldwx	$nhi($Hhh),$Thh
548	shrpw	$Zlh,$Zll,4,$Zll
549	ldwx	$rem($rem_4bit),$rem
550	shrpw	$Zhl,$Zlh,4,$Zlh
551	xor	$byte,$nlo,$nlo
552	shrpw	$Zhh,$Zhl,4,$Zhl
553	and	$mask0xf0,$nlo,$nhi
554	extru	$Zhh,27,28,$Zhh
555	zdep	$nlo,27,4,$nlo
556	xor	$Tll,$Zll,$Zll
557	ldwx	$nlo($Hll),$Tll
558	xor	$Tlh,$Zlh,$Zlh
559	ldwx	$nlo($Hlh),$Tlh
560	xor	$rem,$Zhh,$Zhh
561	addib,uv -1,$cnt,L\$oop_ghash_pa1
562	xor	$Thl,$Zhl,$Zhl
563
564	zdep	$Zll,28,4,$rem
565	ldwx	$nlo($Hhl),$Thl
566	xor	$Thh,$Zhh,$Zhh
567	ldwx	$rem($rem_4bit),$rem
568	shrpw	$Zlh,$Zll,4,$Zll
569	ldwx	$nlo($Hhh),$Thh
570	shrpw	$Zhl,$Zlh,4,$Zlh
571	xor	$Tll,$Zll,$Zll
572	ldwx	$nhi($Hll),$Tll
573	shrpw	$Zhh,$Zhl,4,$Zhl
574	xor	$Tlh,$Zlh,$Zlh
575	ldwx	$nhi($Hlh),$Tlh
576	extru	$Zhh,27,28,$Zhh
577	xor	$rem,$Zhh,$Zhh
578	xor	$Thl,$Zhl,$Zhl
579	ldwx	$nhi($Hhl),$Thl
580	xor	$Thh,$Zhh,$Zhh
581	ldwx	$nhi($Hhh),$Thh
582	zdep	$Zll,28,4,$rem
583	ldwx	$rem($rem_4bit),$rem
584	shrpw	$Zlh,$Zll,4,$Zll
585	shrpw	$Zhl,$Zlh,4,$Zlh
586	shrpw	$Zhh,$Zhl,4,$Zhl
587	extru	$Zhh,27,28,$Zhh
588	xor	$Tll,$Zll,$Zll
589	xor	$Tlh,$Zlh,$Zlh
590	xor	$rem,$Zhh,$Zhh
591	stw	$Zll,12($Xi)
592	xor	$Thl,$Zhl,$Zhl
593	stw	$Zlh,8($Xi)
594	xor	$Thh,$Zhh,$Zhh
595	stw	$Zhl,4($Xi)
596	ldo	16($inp),$inp
597	stw	$Zhh,0($Xi)
598	comb,<>	$inp,$len,L\$outer_ghash_pa1
599	copy	$Zll,$nlo
600___
601$code.=<<___;
602L\$done_ghash
603	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
604	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
605	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
606	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
607___
608$code.=<<___ if ($SIZE_T==4);
609	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
610	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
611	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
612	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
613	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
614___
615$code.=<<___;
616	bv	(%r2)
617	.EXIT
618	$POPMB	-$FRAME(%sp),%r3
619	.PROCEND
620
621	.ALIGN	64
622L\$rem_4bit
623	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
624	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
625	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
626	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
627	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
628	.ALIGN	64
629___
630
631# Explicitly encode PA-RISC 2.0 instructions used in this module, so
632# that it can be compiled with .LEVEL 1.0. It should be noted that I
633# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
634# directive...
635
636my $ldd = sub {
637  my ($mod,$args) = @_;
638  my $orig = "ldd$mod\t$args";
639
640    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
641    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
642	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
643    }
644    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
645    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
646	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
647	$opcode|=(1<<5)  if ($mod =~ /^,m/);
648	$opcode|=(1<<13) if ($mod =~ /^,mb/);
649	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
650    }
651    else { "\t".$orig; }
652};
653
654my $std = sub {
655  my ($mod,$args) = @_;
656  my $orig = "std$mod\t$args";
657
658    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
659    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
660	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
661    }
662    else { "\t".$orig; }
663};
664
665my $extrd = sub {
666  my ($mod,$args) = @_;
667  my $orig = "extrd$mod\t$args";
668
669    # I only have ",u" completer, it's implicitly encoded...
670    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
671    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
672	my $len=32-$3;
673	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
674	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
675	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
676    }
677    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
678    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
679	my $len=32-$2;
680	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
681	$opcode |= (1<<13) if ($mod =~ /,\**=/);
682	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
683    }
684    else { "\t".$orig; }
685};
686
687my $shrpd = sub {
688  my ($mod,$args) = @_;
689  my $orig = "shrpd$mod\t$args";
690
691    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
692    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
693	my $cpos=63-$3;
694	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
695	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
696    }
697    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
698    {	sprintf "\t.WORD\t0x%08x\t; %s",
699		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
700    }
701    else { "\t".$orig; }
702};
703
704my $depd = sub {
705  my ($mod,$args) = @_;
706  my $orig = "depd$mod\t$args";
707
708    # I only have ",z" completer, it's implicitly encoded...
709    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 16
710    {	my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
711    	my $cpos=63-$2;
712	my $len=32-$3;
713	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
714	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
715	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
716    }
717    else { "\t".$orig; }
718};
719
720sub assemble {
721  my ($mnemonic,$mod,$args)=@_;
722  my $opcode = eval("\$$mnemonic");
723
724    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
725}
726
727if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
728	=~ /GNU assembler/) {
729    $gnuas = 1;
730}
731
732foreach (split("\n",$code)) {
733	s/\`([^\`]*)\`/eval $1/ge;
734	if ($SIZE_T==4) {
735		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
736		s/cmpb,\*/comb,/;
737		s/,\*/,/;
738	}
739
740	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
741	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
742	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
743	s/\bbv\b/bve/			if ($SIZE_T==8);
744
745	print $_,"\n";
746}
747
748close STDOUT or die "error closing STDOUT: $!";
749