#if defined(__arm__)
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)
.arch	armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#ifdef  __clang__
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif

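@ rem_4bit below holds the 16-bit reduction constants for a 4-bit remainder:
@ each entry is the carry-less (GF(2)) product of its index with 0x1C20
@ (0xE1<<5), i.e. the pre-reduced contribution of the four bits shifted out
@ of Xi at each step, per GHASH's polynomial x^128 + x^7 + x^2 + x + 1 in
@ bit-reflected form. A minimal C sketch (for reference only, not assembled;
@ assumes <stdint.h>) that reproduces the table below:
@
@	uint16_t rem_4bit_entry(unsigned i) {   /* i in 0..15 */
@		uint16_t v = 0;
@		for (int b = 0; b < 4; b++)
@			if (i & (1u << b))
@				v ^= (uint16_t)(0x1C20u << b);
@		return v;                       /* e.g. rem_4bit_entry(3) == 0x2460 */
@	}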
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

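@ gcm_ghash_4bit(Xi, Htable, inp, len) -- register roles inferred from the
@ code below, prototype per the usual OpenSSL GHASH convention:
@   r0 = Xi (16-byte hash value, updated in place)
@   r1 = Htable (16 entries of 16 bytes each, indexed by a 4-bit nibble)
@   r2 = inp (input blocks), r3 = len (expected to be a multiple of 16)
@ For each 16-byte block: Xi ^= block, then Xi *= H via the 4-bit table walk,
@ with rem_4bit copied onto the stack for the reduction lookups.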
.globl	gcm_ghash_4bit
.hidden	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}		@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

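@ gcm_gmult_4bit(Xi, Htable) -- r0 = Xi (updated in place), r1 = Htable;
@ register roles inferred from the code below. Computes Xi = Xi * H with the
@ same 4-bit table walk as gcm_ghash_4bit, with the rem_4bit pointer returned
@ in r2 by rem_4bit_get.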
.globl	gcm_gmult_4bit
.hidden	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

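@ gcm_init_neon(Htable, H) -- r0 = output (a single 16-byte entry, the
@ "twisted" H, is written), r1 = H; register roles inferred from the code
@ below. The twist is H <<= 1 with the 0xc2...01 constant (derived from the
@ GHASH polynomial) XORed in when the top bit of H was set, as the in-line
@ comments indicate.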
.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26		@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

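@ gcm_gmult_neon(Xi, Htable) -- r0 = Xi (updated in place), r1 = twisted H
@ from gcm_init_neon; register roles inferred from the code below. Loads Xi,
@ sets the byte count r3 to 16 and branches into the shared .Lgmult_neon
@ multiply/reduce path, i.e. Xi = Xi * H.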
.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

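@ gcm_ghash_neon(Xi, Htable, inp, len) -- r0 = Xi, r1 = twisted H, r2 = inp,
@ r3 = len (expected to be a multiple of 16); register roles inferred from
@ the code below. Each 16-byte block is byte-reversed on little-endian
@ targets, XORed into Xi and run through the shared .Lgmult_neon
@ multiply/reduce path.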
.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
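@ .Lgmult_neon: 128x128-bit carry-less multiply of q3 (Xi, or Xi^block) by
@ the twisted H in d26/d27, split Karatsuba-style into three 64x64-bit
@ products (d26*d6, (d26^d27)*(d6^d7) via d28, and d27*d7). Each 64x64
@ product is in turn assembled from 8x8-bit vmull.p8 multiplies of
@ byte-rotated operands (the A1..A3 / B1..B4 terms annotated below), and the
@ combined 256-bit result is then reduced modulo the GHASH polynomial.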
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d0, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7	@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d2, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8	d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8	d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8	d4, d7, d7, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
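	@ The shift amounts below pick out the x, x^2 and x^7 terms of the
	@ bit-reflected GHASH polynomial x^128 + x^7 + x^2 + x + 1: left by
	@ 63, 62 and 57 in the 1st phase, and (in combined form) right by
	@ 1, 2 and 7 in the 2nd phase.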
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20	@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif