// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build arm,!gccgo,!appengine,!nacl

#include "textflag.h"

// This code was translated into a form compatible with 5a from the public
// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.

// Five 32-bit masks (20 bytes, read-only). poly1305_init_ext_armv6 loads all
// five words and ANDs them, in order, with the five 26-bit words it builds
// from the first 16 bytes of the key.
DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
GLOBL ·poly1305_init_constants_armv6<>(SB), RODATA, $20

// Warning: the linker may use R11 to synthesize certain instructions. Please
// take care and verify that no synthetic instructions use it.

// poly1305_init_ext_armv6 fills in the first 60 bytes of the context.
//
// In:   R0 = pointer to the 64-byte context
//       R1 = pointer to the 32-byte key
//
// Context layout as written here (byte offsets from the incoming R0):
//    0..19  first 16 key bytes, split into five words and masked with
//           poly1305_init_constants_armv6
//   20..39  zero
//   40..55  last 16 key bytes, copied verbatim
//   56..59  zero
//
// Clobbers R0-R3, R8, R9, g, R11, R12. R1 is advanced by 32 (two
// write-back loads). R4-R7 are saved on the stack and restored.
TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
	// might look like it's only 60 bytes of space but the final four bytes
	// will be written by another function.) We need to skip over four
	// bytes of stack because that's saving the value of 'g'.
	ADD       $4, R13, R8
	MOVM.IB   [R4-R7], (R8)                                  // save R4-R7 at 8(R13)..20(R13)
	MOVM.IA.W (R1), [R2-R5]                                  // first 16 key bytes; R1 += 16
	MOVW      $·poly1305_init_constants_armv6<>(SB), R7      // R7 = address of the mask table

	// Split the 128-bit value in R2..R5 into five words at bit offsets
	// 0, 26, 52, 78 and 104. The masks applied below trim each to size.
	MOVW      R2, R8
	MOVW      R2>>26, R9
	MOVW      R3>>20, g
	MOVW      R4>>14, R11
	MOVW      R5>>8, R12
	ORR       R3<<6, R9, R9
	ORR       R4<<12, g, g
	ORR       R5<<18, R11, R11
	MOVM.IA   (R7), [R2-R6]                                  // load the five masks
	AND       R8, R2, R2
	AND       R9, R3, R3
	AND       g, R4, R4
	AND       R11, R5, R5
	AND       R12, R6, R6
	MOVM.IA.W [R2-R6], (R0)                                  // context[0..19]

	// context[20..39] = 0
	EOR       R2, R2, R2
	EOR       R3, R3, R3
	EOR       R4, R4, R4
	EOR       R5, R5, R5
	EOR       R6, R6, R6
	MOVM.IA.W [R2-R6], (R0)

	// context[40..55] = last 16 key bytes; R6 is still zero, so
	// context[56..59] = 0.
	MOVM.IA.W (R1), [R2-R5]
	MOVM.IA   [R2-R6], (R0)

	// Restore R4-R7 from 8(R13)..20(R13).
	ADD       $20, R13, R0
	MOVM.DA   (R0), [R4-R7]
	RET

// MOVW_UNALIGNED copies the four bytes at offset(Rsrc) to offset(Rdst) one
// byte at a time, so neither address has to be word aligned. Rtmp is
// clobbered; Rsrc and Rdst are left unchanged.
#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
	MOVBU (offset+0)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+0)(Rdst); \
	MOVBU (offset+1)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+1)(Rdst); \
	MOVBU (offset+2)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+2)(Rdst); \
	MOVBU (offset+3)(Rsrc), Rtmp; \
	MOVBU Rtmp, (offset+3)(Rdst)

// poly1305_blocks_armv6 absorbs whole 16-byte blocks into the accumulator
// held in the context.
//
// In:   R0 = pointer to the context
//       R1 = pointer to the input (any alignment; unaligned input is first
//            copied byte-wise to the stack)
//       R2 = number of input bytes; one block is consumed for every full
//            16 bytes, nothing is done if R2 < 16
//
// Context fields used (byte offsets): 0..19 multiplier words (read),
// 20..39 accumulator words (read and written back), 56 flag word: when it
// is zero, 1<<24 is ORed into the top word of every block, otherwise
// nothing is added.
//
// Stack slots used, relative to R13:
//   28..48    saved R4-R8, R14
//   52..80    four 64-bit partial products (low word first)
//   84        the 1<<24 / 0 value derived from the flag word
//   88,92,96  saved R0, current input pointer, remaining byte count
//   100..115  aligned copy of an unaligned input block
//   116..135  copy of the five multiplier words
//
// Clobbers R0-R3, g, R11, R12. R4-R8 and R14 are saved and restored; R9 is
// overwritten and not restored.
TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
	// Needs 24 bytes of stack for saved registers and then 88 bytes of
	// scratch space after that. We assume that 24 bytes at (R13) have
	// already been used: four bytes for the link register saved in the
	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
	// in that function and 16 bytes of scratch space used around
	// poly1305_finish_ext_armv6_skip1.
	ADD     $24, R13, R12
	MOVM.IB [R4-R8, R14], (R12)
	MOVW    R0, 88(R13)
	MOVW    R1, 92(R13)
	MOVW    R2, 96(R13)
	MOVW    R1, R14                    // R14 = input pointer
	MOVW    R2, R12                    // R12 = bytes remaining
	MOVW    56(R0), R8
	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
	EOR     R6, R6, R6                 // no .S suffix: the flags from TST survive
	MOVW.EQ $(1<<24), R6               // flag word == 0  ->  R6 = 1<<24
	MOVW    R6, 84(R13)
	ADD     $116, R13, g
	MOVM.IA (R0), [R0-R9]              // R0-R4 = multiplier words, R5-R9 = accumulator words
	MOVM.IA [R0-R4], (g)               // keep the multiplier words at 116(R13)
	CMP     $16, R12
	BLO     poly1305_blocks_armv6_done

poly1305_blocks_armv6_mainloop:
	// Load the next 16 input bytes into R0-R3 and advance R14 by 16.
	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
	BEQ     poly1305_blocks_armv6_mainloop_aligned
	ADD     $100, R13, g
	MOVW_UNALIGNED(R14, g, R0, 0)
	MOVW_UNALIGNED(R14, g, R0, 4)
	MOVW_UNALIGNED(R14, g, R0, 8)
	MOVW_UNALIGNED(R14, g, R0, 12)
	MOVM.IA (g), [R0-R3]
	ADD     $16, R14
	B       poly1305_blocks_armv6_mainloop_loaded

poly1305_blocks_armv6_mainloop_aligned:
	MOVM.IA.W (R14), [R0-R3]

poly1305_blocks_armv6_mainloop_loaded:
	// Split the block into five 26-bit words (R0, g, R11, R12, R4), OR the
	// value saved at 84(R13) into the top word, and add the words to the
	// accumulator in R5-R9.
	MOVW    R0>>26, g
	MOVW    R1>>20, R11
	MOVW    R2>>14, R12
	MOVW    R14, 92(R13)               // save the advanced input pointer
	MOVW    R3>>8, R4
	ORR     R1<<6, g, g
	ORR     R2<<12, R11, R11
	ORR     R3<<18, R12, R12
	BIC     $0xfc000000, R0, R0
	BIC     $0xfc000000, g, g
	MOVW    84(R13), R3
	BIC     $0xfc000000, R11, R11
	BIC     $0xfc000000, R12, R12
	ADD     R0, R5, R5
	ADD     g, R6, R6
	ORR     R3, R4, R4
	ADD     R11, R7, R7
	ADD     $116, R13, R14
	ADD     R12, R8, R8
	ADD     R4, R9, R9
	MOVM.IA (R14), [R0-R4]             // R0-R4 = multiplier words

	// Multiply accumulator (R5-R9) by multiplier (R0-R4). Two 64-bit sums
	// are built at a time, in (R11, g) and (R14, R12), high word first.
	// Multiplier words are replaced by five times their value
	// (x + (x<<2)) just before the terms that need that form.
	MULLU   R4, R5, (R11, g)
	MULLU   R3, R5, (R14, R12)
	MULALU  R3, R6, (R11, g)
	MULALU  R2, R6, (R14, R12)
	MULALU  R2, R7, (R11, g)
	MULALU  R1, R7, (R14, R12)
	ADD     R4<<2, R4, R4
	ADD     R3<<2, R3, R3
	MULALU  R1, R8, (R11, g)
	MULALU  R0, R8, (R14, R12)
	MULALU  R0, R9, (R11, g)
	MULALU  R4, R9, (R14, R12)
	MOVW    g, 76(R13)
	MOVW    R11, 80(R13)
	MOVW    R12, 68(R13)
	MOVW    R14, 72(R13)
	MULLU   R2, R5, (R11, g)
	MULLU   R1, R5, (R14, R12)
	MULALU  R1, R6, (R11, g)
	MULALU  R0, R6, (R14, R12)
	MULALU  R0, R7, (R11, g)
	MULALU  R4, R7, (R14, R12)
	ADD     R2<<2, R2, R2
	ADD     R1<<2, R1, R1
	MULALU  R4, R8, (R11, g)
	MULALU  R3, R8, (R14, R12)
	MULALU  R3, R9, (R11, g)
	MULALU  R2, R9, (R14, R12)
	MOVW    g, 60(R13)
	MOVW    R11, 64(R13)
	MOVW    R12, 52(R13)
	MOVW    R14, 56(R13)
	MULLU   R0, R5, (R11, g)
	MULALU  R4, R6, (R11, g)
	MULALU  R3, R7, (R11, g)
	MULALU  R2, R8, (R11, g)
	MULALU  R1, R9, (R11, g)

	// Reload the four spilled sums into R0-R7 (low word, high word pairs);
	// the fifth is still in (R11, g). Carry the bits above bit 25 of each
	// sum into the next one; the carry out of the last sum is multiplied by
	// five and added to the first. The new accumulator ends up in R5-R9.
	ADD     $52, R13, R0
	MOVM.IA (R0), [R0-R7]
	MOVW    g>>26, R12
	MOVW    R4>>26, R14
	ORR     R11<<6, R12, R12
	ORR     R5<<6, R14, R14
	BIC     $0xfc000000, g, g
	BIC     $0xfc000000, R4, R4
	ADD.S   R12, R0, R0
	ADC     $0, R1, R1
	ADD.S   R14, R6, R6
	ADC     $0, R7, R7
	MOVW    R0>>26, R12
	MOVW    R6>>26, R14
	ORR     R1<<6, R12, R12
	ORR     R7<<6, R14, R14
	BIC     $0xfc000000, R0, R0
	BIC     $0xfc000000, R6, R6
	ADD     R14<<2, R14, R14
	ADD.S   R12, R2, R2
	ADC     $0, R3, R3
	ADD     R14, g, g
	MOVW    R2>>26, R12
	MOVW    g>>26, R14
	ORR     R3<<6, R12, R12
	BIC     $0xfc000000, g, R5
	BIC     $0xfc000000, R2, R7
	ADD     R12, R4, R4
	ADD     R14, R0, R0
	MOVW    R4>>26, R12
	BIC     $0xfc000000, R4, R8
	ADD     R12, R6, R9

	// Loop while at least 32 bytes remained before this block, i.e. at
	// least 16 remain after it. SUB has no .S suffix, so BHS tests the
	// flags set by CMP.
	MOVW    96(R13), R12
	MOVW    92(R13), R14
	MOVW    R0, R6
	CMP     $32, R12
	SUB     $16, R12, R12
	MOVW    R12, 96(R13)
	BHS     poly1305_blocks_armv6_mainloop

poly1305_blocks_armv6_done:
	// Write the accumulator back to context[20..39] and restore the saved
	// registers.
	MOVW    88(R13), R12
	MOVW    R5, 20(R12)
	MOVW    R6, 24(R12)
	MOVW    R7, 28(R12)
	MOVW    R8, 32(R12)
	MOVW    R9, 36(R12)
	ADD     $48, R13, R0
	MOVM.DA (R0), [R4-R8, R14]
	RET

// MOVHUP_UNALIGNED copies two bytes from (Rsrc) to (Rdst) one byte at a
// time, post-incrementing both pointers, so both end up advanced by 2.
// Rtmp is clobbered.
#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
	MOVBU.P 1(Rsrc), Rtmp; \
	MOVBU.P Rtmp, 1(Rdst); \
	MOVBU.P 1(Rsrc), Rtmp; \
	MOVBU.P Rtmp, 1(Rdst)

// MOVWP_UNALIGNED copies four bytes from (Rsrc) to (Rdst) as two
// MOVHUP_UNALIGNED steps; both pointers end up advanced by 4. Rtmp is
// clobbered.
#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)

// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
//
// Computes the 16-byte tag of the mlen bytes at m under key and stores it
// at out. The context is built on this function's own stack frame and is
// overwritten with zeros before returning.
TEXT ·poly1305_auth_armv6(SB), $196-16
	// The value 196, just above, is the sum of 64 (the size of the context
	// structure) and 132 (the amount of stack needed).
	//
	// At this point, the stack pointer (R13) has been moved down. It
	// points to the saved link register and there's 196 bytes of free
	// space above it.
	//
	// The stack for this function looks like:
	//
	// +---------------------
	// |
	// | 64 bytes of context structure
	// |
	// +---------------------
	// |
	// | 112 bytes for poly1305_blocks_armv6
	// |
	// +---------------------
	// | 16 bytes of final block, constructed at
	// | poly1305_finish_ext_armv6_skip8
	// +---------------------
	// | four bytes of saved 'g'
	// +---------------------
	// | lr, saved by prelude    <- R13 points here
	// +---------------------
	MOVW g, 4(R13)

	MOVW out+0(FP), R4
	MOVW m+4(FP), R5
	MOVW mlen+8(FP), R6
	MOVW key+12(FP), R7

	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
	MOVW R7, R1

	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
	// that's ok because none of the other values have been written yet.
	BL    poly1305_init_ext_armv6<>(SB)

	// R2 = mlen rounded down to a multiple of 16. If it is non-zero, hand
	// those bytes to poly1305_blocks_armv6 and leave R5/R6 describing the
	// remaining tail (pointer / length).
	BIC.S $15, R6, R2
	BEQ   poly1305_auth_armv6_noblocks
	ADD   $136, R13, R0
	MOVW  R5, R1
	ADD   R2, R5, R5
	SUB   R2, R6, R6
	BL    poly1305_blocks_armv6<>(SB)

poly1305_auth_armv6_noblocks:
	// From here on: R5 = context, R6 = tail pointer, R7 = tail length,
	// R8 = out (with R0-R3 holding the same four values initially).
	ADD  $136, R13, R0
	MOVW R5, R1
	MOVW R6, R2
	MOVW R4, R3

	MOVW  R0, R5
	MOVW  R1, R6
	MOVW  R2, R7
	MOVW  R3, R8
	AND.S R2, R2, R2
	BEQ   poly1305_finish_ext_armv6_noremaining

	// Build the final block in the 16 bytes at 8(R13): zero it, copy the
	// tail bytes in chunks of 8/4/2/1 selected by the bits of the length in
	// R2, then append a single 1 byte. R1 and R9 are the running source and
	// destination pointers.
	EOR   R0, R0
	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
	MOVW  R0, (R9)
	MOVW  R0, 4(R9)
	MOVW  R0, 8(R9)
	MOVW  R0, 12(R9)
	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
	BEQ   poly1305_finish_ext_armv6_aligned
	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
	BEQ   poly1305_finish_ext_armv6_skip8
	MOVWP_UNALIGNED(R1, R9, g)
	MOVWP_UNALIGNED(R1, R9, g)

poly1305_finish_ext_armv6_skip8:
	WORD $0xe3120004                     // TST $4, R2 not working see issue 5921
	BEQ  poly1305_finish_ext_armv6_skip4
	MOVWP_UNALIGNED(R1, R9, g)

poly1305_finish_ext_armv6_skip4:
	WORD $0xe3120002                     // TST $2, R2 not working see issue 5921
	BEQ  poly1305_finish_ext_armv6_skip2
	MOVHUP_UNALIGNED(R1, R9, g)
	B    poly1305_finish_ext_armv6_skip2

poly1305_finish_ext_armv6_aligned:
	WORD      $0xe3120008                             // TST R2, #8 not working see issue 5921
	BEQ       poly1305_finish_ext_armv6_skip8_aligned
	MOVM.IA.W (R1), [g-R11]
	MOVM.IA.W [g-R11], (R9)

poly1305_finish_ext_armv6_skip8_aligned:
	WORD   $0xe3120004                             // TST $4, R2 not working see issue 5921
	BEQ    poly1305_finish_ext_armv6_skip4_aligned
	MOVW.P 4(R1), g
	MOVW.P g, 4(R9)

poly1305_finish_ext_armv6_skip4_aligned:
	WORD    $0xe3120002                     // TST $2, R2 not working see issue 5921
	BEQ     poly1305_finish_ext_armv6_skip2
	MOVHU.P 2(R1), g
	MOVH.P  g, 2(R9)

poly1305_finish_ext_armv6_skip2:
	WORD    $0xe3120001                     // TST $1, R2 not working see issue 5921
	BEQ     poly1305_finish_ext_armv6_skip1
	MOVBU.P 1(R1), g
	MOVBU.P g, 1(R9)

poly1305_finish_ext_armv6_skip1:
	// Append the 1 byte, set the context flag word at offset 56 to 1 (so
	// poly1305_blocks_armv6 adds nothing to the top word of this block),
	// and process the 16-byte scratch block.
	MOVW  $1, R11
	MOVBU R11, 0(R9)
	MOVW  R11, 56(R5)
	MOVW  R5, R0
	ADD   $8, R13, R1
	MOVW  $16, R2
	BL    poly1305_blocks_armv6<>(SB)

poly1305_finish_ext_armv6_noremaining:
	// Load the five accumulator words and run a full carry pass, folding
	// the carry out of the top word (times five) back into the bottom word.
	MOVW      20(R5), R0
	MOVW      24(R5), R1
	MOVW      28(R5), R2
	MOVW      32(R5), R3
	MOVW      36(R5), R4
	MOVW      R4>>26, R12
	BIC       $0xfc000000, R4, R4
	ADD       R12<<2, R12, R12
	ADD       R12, R0, R0
	MOVW      R0>>26, R12
	BIC       $0xfc000000, R0, R0
	ADD       R12, R1, R1
	MOVW      R1>>26, R12
	BIC       $0xfc000000, R1, R1
	ADD       R12, R2, R2
	MOVW      R2>>26, R12
	BIC       $0xfc000000, R2, R2
	ADD       R12, R3, R3
	MOVW      R3>>26, R12
	BIC       $0xfc000000, R3, R3
	ADD       R12, R4, R4

	// Compute a second candidate in R6, R7, g, R11, R9: the accumulator
	// plus 5, carried through, with 1<<26 subtracted from the top word.
	ADD       $5, R0, R6
	MOVW      R6>>26, R12
	BIC       $0xfc000000, R6, R6
	ADD       R12, R1, R7
	MOVW      R7>>26, R12
	BIC       $0xfc000000, R7, R7
	ADD       R12, R2, g
	MOVW      g>>26, R12
	BIC       $0xfc000000, g, g
	ADD       R12, R3, R11
	MOVW      $-(1<<26), R12
	ADD       R11>>26, R12, R12
	BIC       $0xfc000000, R11, R11
	ADD       R12, R4, R9

	// Branch-free select: R12 = (top bit of R9) - 1, i.e. all ones when the
	// candidate's top word is non-negative and zero otherwise. Keep the
	// candidate under that mask and the original under its complement.
	MOVW      R9>>31, R12
	SUB       $1, R12
	AND       R12, R6, R6
	AND       R12, R7, R7
	AND       R12, g, g
	AND       R12, R11, R11
	AND       R12, R9, R9
	MVN       R12, R12
	AND       R12, R0, R0
	AND       R12, R1, R1
	AND       R12, R2, R2
	AND       R12, R3, R3
	AND       R12, R4, R4
	ORR       R6, R0, R0
	ORR       R7, R1, R1
	ORR       g, R2, R2
	ORR       R11, R3, R3
	ORR       R9, R4, R4

	// Repack the five 26-bit words into four 32-bit words in R0-R3.
	ORR       R1<<26, R0, R0
	MOVW      R1>>6, R1
	ORR       R2<<20, R1, R1
	MOVW      R2>>12, R2
	ORR       R3<<14, R2, R2
	MOVW      R3>>18, R3
	ORR       R4<<8, R3, R3

	// Add context[40..55] as a 128-bit value (carry chained through ADC)
	// and store the 16-byte result at out.
	MOVW      40(R5), R6
	MOVW      44(R5), R7
	MOVW      48(R5), g
	MOVW      52(R5), R11
	ADD.S     R6, R0, R0
	ADC.S     R7, R1, R1
	ADC.S     g, R2, R2
	ADC.S     R11, R3, R3
	MOVM.IA   [R0-R3], (R8)

	// Overwrite all 64 bytes of the context with zeros (two stores of
	// eight zeroed registers), then restore g.
	MOVW      R5, R12
	EOR       R0, R0, R0
	EOR       R1, R1, R1
	EOR       R2, R2, R2
	EOR       R3, R3, R3
	EOR       R4, R4, R4
	EOR       R5, R5, R5
	EOR       R6, R6, R6
	EOR       R7, R7, R7
	MOVM.IA.W [R0-R7], (R12)
	MOVM.IA   [R0-R7], (R12)
	MOVW      4(R13), g
	RET
