#include "arm_arch.h"

.text
.code	32

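@ rem_4bit is the table of 16 pre-computed reduction constants used by
@ the 4-bit table-lookup GHASH code below: entry [rem] is folded into
@ the accumulator whenever a nibble "rem" is shifted out during
@ reduction modulo the GCM polynomial.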
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

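@ rem_4bit_get returns the address of rem_4bit in r2, computed relative
@ to pc so the code stays position-independent, then branches back to
@ .Lrem_4bit_got inside gcm_gmult_4bit.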
.type	rem_4bit_get,%function
rem_4bit_get:
	sub	r2,pc,#8
	sub	r2,r2,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

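@ gcm_ghash_4bit(Xi,Htable,inp,len):
@	r0 = Xi (128-bit hash value), r1 = Htable (16 pre-computed
@	multiples of H), r2 = inp, r3 = len (a multiple of 16).
@ Each .Louter iteration hashes one 16-byte block of inp into Xi using
@ the 4-bit table-lookup method; rem_4bit is copied onto the stack so
@ rem_4bit[rem] can be fetched with a single ldrh off sp.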
.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save r3/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

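@ .Linner walks the remaining bytes of Xi^inp from offset 14 down to 0,
@ one byte (two nibbles) per iteration, folding in rem_4bit[rem] after
@ every 4-bit right shift of the r4-r7 accumulator.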
.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

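@ Inner loop done: reload the end pointer, advance inp by 16 and write
@ the updated Xi back through r0.  ARMv7 little-endian uses rev+str per
@ word, big-endian a plain str, everything else byte-by-byte stores.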
	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

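@ gcm_gmult_4bit(Xi,Htable):
@	r0 = Xi, r1 = Htable.  Multiplies Xi by H in place with the same
@	4-bit table walk as above, except that rem_4bit is addressed
@	directly through r2 (set up by rem_4bit_get) rather than copied
@	to the stack.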
.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4-r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8-r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8-r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_ARCH__>=7
.fpu	neon

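@ gcm_gmult_neon(Xi,Htable): NEON counterpart of gcm_gmult_4bit.
@ r1 is stepped back by 16 bytes to reach H itself in GCM128_CTX, Xi and
@ H are loaded into d-registers, and the product is accumulated byte by
@ byte with vmull.p8 in the shared .Linner_neon loop.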
.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	sub		r1,#16		@ point at H in GCM128_CTX
	vld1.64		d29,[r0,:64]!	@ load Xi
	vmov.i32	d5,#0xe1		@ our irreducible polynomial
	vld1.64		d28,[r0,:64]!
	vshr.u64	d5,#32
	vldmia		r1,{d0-d1}	@ load H
	veor		q12,q12
#ifdef __ARMEL__
	vrev64.8	q14,q14
#endif
	veor		q13,q13
	veor		q11,q11
	mov		r1,#16
	veor		q10,q10
	mov		r3,#16
	veor		d2,d2
	vdup.8		d4,d28[0]	@ broadcast lowest byte
	b		.Linner_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

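@ gcm_ghash_neon(Xi,Htable,inp,len): NEON counterpart of gcm_ghash_4bit.
@ One 16-byte block of inp is xored into Xi per .Louter_neon iteration
@ and multiplied by H with the byte-wise vmull.p8 schedule in
@ .Linner_neon; r3 counts the remaining length down in steps of 16.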
.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		d21,[r0,:64]!	@ load Xi
	vmov.i32	d5,#0xe1		@ our irreducible polynomial
	vld1.64		d20,[r0,:64]!
	vshr.u64	d5,#32
	vldmia		r0,{d0-d1}		@ load H
	veor		q12,q12
	nop
#ifdef __ARMEL__
	vrev64.8	q10,q10
#endif
.Louter_neon:
	vld1.64		d29,[r2]!	@ load inp
	veor		q13,q13
	vld1.64		d28,[r2]!
	veor		q11,q11
	mov		r1,#16
#ifdef __ARMEL__
	vrev64.8	q14,q14
#endif
	veor		d2,d2
	veor		q14,q10			@ inp^=Xi
	veor		q10,q10
	vdup.8		d4,d28[0]	@ broadcast lowest byte
.Linner_neon:
	subs		r1,r1,#1
	vmull.p8	q9,d1,d4		@ H.lo·Xi[i]
	vmull.p8	q8,d0,d4		@ H.hi·Xi[i]
	vext.8		q14,q12,#1		@ IN>>=8

	veor		q10,q13		@ modulo-scheduled part
	vshl.i64	d22,#48
	vdup.8		d4,d28[0]	@ broadcast lowest byte
	veor		d3,d18,d20

	veor		d21,d22
	vuzp.8		q9,q8
	vsli.8		d2,d3,#1		@ compose the "carry" byte
	vext.8		q10,q12,#1		@ Z>>=8

	vmull.p8	q11,d2,d5		@ "carry"·0xe1
	vshr.u8		d2,d3,#7		@ save Z's bottom bit
	vext.8		q13,q9,q12,#1	@ Qlo>>=8
	veor		q10,q8
	bne		.Linner_neon

	veor		q10,q13		@ modulo-scheduled artefact
	vshl.i64	d22,#48
	veor		d21,d22

	@ finalization, normalize Z:Zo
	vand		d2,d5		@ suffices to mask the bit
	vshr.u64	d3,d20,#63
	vshl.i64	q10,#1
	subs		r3,#16
	vorr		q10,q1		@ Z=Z:Zo<<1
	bne		.Louter_neon

#ifdef __ARMEL__
	vrev64.8	q10,q10
#endif
	sub		r0,#16
	vst1.64		d21,[r0,:64]!	@ write out Xi
	vst1.64		d20,[r0,:64]

	.word	0xe12fff1e
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align  2