• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
2
3#ifndef _UECC_ASM_ARM_MULT_SQUARE_H_
4#define _UECC_ASM_ARM_MULT_SQUARE_H_
5
/*
 * FAST_MULT_ASM_5 -- ARM inline-assembly fragment: 5-word x 5-word multiply.
 *
 * A word is 32 bits (umull forms the 64-bit product of two 32-bit registers).
 * Word indices in the comments below are relative to the register values on
 * entry:
 *   r0 -> result (10 words written),
 *   r1 -> left operand (5 words read),
 *   r2 -> right operand (5 words read).
 * "left"/"right" follow the naming of the existing comments in this file.
 *
 * Method: the partial product left[0..1] * right[3..4] is written to
 * result[3..6] first; result[0..9] is then produced one column at a time,
 * adding the stored partial words back in for columns 3..6.
 *
 * r3 is saved and restored with push/pop. r4-r12 and r14 are overwritten and
 * the condition flags are modified. On exit r0 has advanced by 40 bytes and
 * r1 and r2 by 20 bytes each.
 */
6#define FAST_MULT_ASM_5                \
7    "push {r3} \n\t"                   \
8    "add r0, 12 \n\t"                  \
9    "add r2, 12 \n\t"                  \
10    "ldmia r1!, {r3,r4} \n\t"          \
11    "ldmia r2!, {r6,r7} \n\t"          \
12    /* r3,r4 = left[0..1], r6,r7 = right[3..4]; partial result[3] = low word of left[0]*right[3] */ \
13    "umull r11, r12, r3, r6 \n\t"      \
14    "stmia r0!, {r11} \n\t"            \
15    /* partial result[4] = left[0]*right[4] + left[1]*right[3] + carry */ \
16    "mov r10, #0 \n\t"                 \
17    "umull r11, r9, r3, r7 \n\t"       \
18    "adds r12, r12, r11 \n\t"          \
19    "adc r9, r9, #0 \n\t"              \
20    "umull r11, r14, r4, r6 \n\t"      \
21    "adds r12, r12, r11 \n\t"          \
22    "adcs r9, r9, r14 \n\t"            \
23    "adc r10, r10, #0 \n\t"            \
24    "stmia r0!, {r12} \n\t"            \
25    /* partial result[5..6] = left[1]*right[4] + carry */ \
26    "umull r12, r14, r4, r7 \n\t"      \
27    "adds r9, r9, r12 \n\t"            \
28    "adc r10, r10, r14 \n\t"           \
29    "stmia r0!, {r9, r10} \n\t"        \
30    /* rewind r0 and r2 to their entry values; r6,r7,r8 = right[0..2], r5 = left[2] */ \
31    "sub r0, 28 \n\t"                  \
32    "sub r2, 20 \n\t"                  \
33    "ldmia r2!, {r6,r7,r8} \n\t"       \
34    "ldmia r1!, {r5} \n\t"             \
35    /* column 0: left[0]*right[0] */   \
36    "umull r11, r12, r3, r6 \n\t"      \
37    "stmia r0!, {r11} \n\t"            \
38    /* column 1: left[0]*right[1] + left[1]*right[0] + carry */ \
39    "mov r10, #0 \n\t"                 \
40    "umull r11, r9, r3, r7 \n\t"       \
41    "adds r12, r12, r11 \n\t"          \
42    "adc r9, r9, #0 \n\t"              \
43    "umull r11, r14, r4, r6 \n\t"      \
44    "adds r12, r12, r11 \n\t"          \
45    "adcs r9, r9, r14 \n\t"            \
46    "adc r10, r10, #0 \n\t"            \
47    "stmia r0!, {r12} \n\t"            \
48    /* column 2: left[0]*right[2] + left[1]*right[1] + left[2]*right[0] + carry */ \
49    "mov r11, #0 \n\t"                 \
50    "umull r12, r14, r3, r8 \n\t"      \
51    "adds r9, r9, r12 \n\t"            \
52    "adcs r10, r10, r14 \n\t"          \
53    "adc r11, r11, #0 \n\t"            \
54    "umull r12, r14, r4, r7 \n\t"      \
55    "adds r9, r9, r12 \n\t"            \
56    "adcs r10, r10, r14 \n\t"          \
57    "adc r11, r11, #0 \n\t"            \
58    "umull r12, r14, r5, r6 \n\t"      \
59    "adds r9, r9, r12 \n\t"            \
60    "adcs r10, r10, r14 \n\t"          \
61    "adc r11, r11, #0 \n\t"            \
62    "stmia r0!, {r9} \n\t"             \
63    /* column 3 (r3 = left[3]): left[1]*right[2] + left[2]*right[1] + left[3]*right[0] + stored partial word */ \
64    "ldmia r1!, {r3} \n\t"             \
65    "mov r12, #0 \n\t"                 \
66    "umull r14, r9, r4, r8 \n\t"       \
67    "adds r10, r10, r14 \n\t"          \
68    "adcs r11, r11, r9 \n\t"           \
69    "adc r12, r12, #0 \n\t"            \
70    "umull r14, r9, r5, r7 \n\t"       \
71    "adds r10, r10, r14 \n\t"          \
72    "adcs r11, r11, r9 \n\t"           \
73    "adc r12, r12, #0 \n\t"            \
74    "umull r14, r9, r3, r6 \n\t"       \
75    "adds r10, r10, r14 \n\t"          \
76    "adcs r11, r11, r9 \n\t"           \
77    "adc r12, r12, #0 \n\t"            \
78    "ldr r14, [r0] \n\t"               \
79    "adds r10, r10, r14 \n\t"          \
80    "adcs r11, r11, #0 \n\t"           \
81    "adc r12, r12, #0 \n\t"            \
82    "stmia r0!, {r10} \n\t"            \
83    /* column 4 (r4 = left[4]): left[2]*right[2] + left[3]*right[1] + left[4]*right[0] + stored partial word */ \
84    "ldmia r1!, {r4} \n\t"             \
85    "mov r14, #0 \n\t"                 \
86    "umull r9, r10, r5, r8 \n\t"       \
87    "adds r11, r11, r9 \n\t"           \
88    "adcs r12, r12, r10 \n\t"          \
89    "adc r14, r14, #0 \n\t"            \
90    "umull r9, r10, r3, r7 \n\t"       \
91    "adds r11, r11, r9 \n\t"           \
92    "adcs r12, r12, r10 \n\t"          \
93    "adc r14, r14, #0 \n\t"            \
94    "umull r9, r10, r4, r6 \n\t"       \
95    "adds r11, r11, r9 \n\t"           \
96    "adcs r12, r12, r10 \n\t"          \
97    "adc r14, r14, #0 \n\t"            \
98    "ldr r9, [r0] \n\t"                \
99    "adds r11, r11, r9 \n\t"           \
100    "adcs r12, r12, #0 \n\t"           \
101    "adc r14, r14, #0 \n\t"            \
102    "stmia r0!, {r11} \n\t"            \
103    /* column 5 (r6 = right[3]): left[2]*right[3] + left[3]*right[2] + left[4]*right[1] + stored partial word */ \
104    "ldmia r2!, {r6} \n\t"             \
105    "mov r9, #0 \n\t"                  \
106    "umull r10, r11, r5, r6 \n\t"      \
107    "adds r12, r12, r10 \n\t"          \
108    "adcs r14, r14, r11 \n\t"          \
109    "adc r9, r9, #0 \n\t"              \
110    "umull r10, r11, r3, r8 \n\t"      \
111    "adds r12, r12, r10 \n\t"          \
112    "adcs r14, r14, r11 \n\t"          \
113    "adc r9, r9, #0 \n\t"              \
114    "umull r10, r11, r4, r7 \n\t"      \
115    "adds r12, r12, r10 \n\t"          \
116    "adcs r14, r14, r11 \n\t"          \
117    "adc r9, r9, #0 \n\t"              \
118    "ldr r10, [r0] \n\t"               \
119    "adds r12, r12, r10 \n\t"          \
120    "adcs r14, r14, #0 \n\t"           \
121    "adc r9, r9, #0 \n\t"              \
122    "stmia r0!, {r12} \n\t"            \
123    /* column 6 (r7 = right[4]): left[2]*right[4] + left[3]*right[3] + left[4]*right[2] + stored partial word */ \
124    "ldmia r2!, {r7} \n\t"             \
125    "mov r10, #0 \n\t"                 \
126    "umull r11, r12, r5, r7 \n\t"      \
127    "adds r14, r14, r11 \n\t"          \
128    "adcs r9, r9, r12 \n\t"            \
129    "adc r10, r10, #0 \n\t"            \
130    "umull r11, r12, r3, r6 \n\t"      \
131    "adds r14, r14, r11 \n\t"          \
132    "adcs r9, r9, r12 \n\t"            \
133    "adc r10, r10, #0 \n\t"            \
134    "umull r11, r12, r4, r8 \n\t"      \
135    "adds r14, r14, r11 \n\t"          \
136    "adcs r9, r9, r12 \n\t"            \
137    "adc r10, r10, #0 \n\t"            \
138    "ldr r11, [r0] \n\t"               \
139    "adds r14, r14, r11 \n\t"          \
140    "adcs r9, r9, #0 \n\t"             \
141    "adc r10, r10, #0 \n\t"            \
142    "stmia r0!, {r14} \n\t"            \
143    /* column 7: left[3]*right[4] + left[4]*right[3] + carry */ \
144    "mov r11, #0 \n\t"                 \
145    "umull r12, r14, r3, r7 \n\t"      \
146    "adds r9, r9, r12 \n\t"            \
147    "adcs r10, r10, r14 \n\t"          \
148    "adc r11, r11, #0 \n\t"            \
149    "umull r12, r14, r4, r6 \n\t"      \
150    "adds r9, r9, r12 \n\t"            \
151    "adcs r10, r10, r14 \n\t"          \
152    "adc r11, r11, #0 \n\t"            \
153    "stmia r0!, {r9} \n\t"             \
154    /* columns 8..9: left[4]*right[4] + carry */ \
155    "umull r14, r9, r4, r7 \n\t"       \
156    "adds r10, r10, r14 \n\t"          \
157    "adc r11, r11, r9 \n\t"            \
158    "stmia r0!, {r10, r11} \n\t"       \
159    "pop {r3} \n\t"
160
/*
 * FAST_MULT_ASM_5_TO_6 -- ARM inline-assembly fragment that adds the terms of
 * one extra operand word to an existing product.
 *
 * If r3 == 5 the fragment branches forward to local label 1, which is not
 * defined here and must be provided by the code that follows the macro.
 * Otherwise it loads r4 = [r1] and r5 = [r2], steps r0, r1 and r2 back by
 * 20 bytes (5 words), and for five consecutive words computes
 *     [r0] + r4 * (word at r2) + r5 * (word at r1) + carry,
 * storing the low word back through r0 and carrying the rest forward.
 * Finally r4*r5 plus the remaining carry is stored as two more words.
 *
 * On exit r0 is 8 bytes and r1, r2 are 4 bytes past their entry values.
 * r4-r12 and r14 are overwritten and the condition flags are modified.
 *
 * NOTE(review): presumably intended to run directly after FAST_MULT_ASM_5,
 * with r0/r1/r2 as that macro leaves them and r3 holding the operand length
 * in words -- confirm against the code that expands these macros.
 */
161#define FAST_MULT_ASM_5_TO_6                 \
162    "cmp r3, #5 \n\t"                        \
163    "beq 1f \n\t"                            \
164                                             \
165    /* r4 = left high, r5 = right high */    \
166    "ldr r4, [r1] \n\t"                      \
167    "ldr r5, [r2] \n\t"                      \
168    /* step all three pointers back by 5 words */ \
169    "sub r0, #20 \n\t"                       \
170    "sub r1, #20 \n\t"                       \
171    "sub r2, #20 \n\t"                       \
172    /* word 0 (from the stepped-back pointers); carry left in r10, r14 */ \
173    "ldr r6, [r0] \n\t"                      \
174    "ldr r7, [r1], #4 \n\t"                  \
175    "ldr r8, [r2], #4 \n\t"                  \
176    "mov r14, #0 \n\t"                       \
177    "umull r9, r10, r4, r8 \n\t"             \
178    "umull r11, r12, r5, r7 \n\t"            \
179    "adds r9, r9, r6 \n\t"                   \
180    "adc r10, r10, #0 \n\t"                  \
181    "adds r9, r9, r11 \n\t"                  \
182    "adcs r10, r10, r12 \n\t"                \
183    "adc r14, r14, #0 \n\t"                  \
184    "str r9, [r0], #4 \n\t"                  \
185    /* word 1; carry left in r14, r9 */      \
186    "ldr r6, [r0] \n\t"                      \
187    "adds r10, r10, r6 \n\t"                 \
188    "adcs r14, r14, #0 \n\t"                 \
189    "ldr r7, [r1], #4 \n\t"                  \
190    "ldr r8, [r2], #4 \n\t"                  \
191    "mov r9, #0 \n\t"                        \
192    "umull r11, r12, r4, r8 \n\t"            \
193    "adds r10, r10, r11 \n\t"                \
194    "adcs r14, r14, r12 \n\t"                \
195    "adc r9, r9, #0 \n\t"                    \
196    "umull r11, r12, r5, r7 \n\t"            \
197    "adds r10, r10, r11 \n\t"                \
198    "adcs r14, r14, r12 \n\t"                \
199    "adc r9, r9, #0 \n\t"                    \
200    "str r10, [r0], #4 \n\t"                 \
201    /* word 2; carry left in r9, r10 */      \
202    "ldr r6, [r0] \n\t"                      \
203    "adds r14, r14, r6 \n\t"                 \
204    "adcs r9, r9, #0 \n\t"                   \
205    "ldr r7, [r1], #4 \n\t"                  \
206    "ldr r8, [r2], #4 \n\t"                  \
207    "mov r10, #0 \n\t"                       \
208    "umull r11, r12, r4, r8 \n\t"            \
209    "adds r14, r14, r11 \n\t"                \
210    "adcs r9, r9, r12 \n\t"                  \
211    "adc r10, r10, #0 \n\t"                  \
212    "umull r11, r12, r5, r7 \n\t"            \
213    "adds r14, r14, r11 \n\t"                \
214    "adcs r9, r9, r12 \n\t"                  \
215    "adc r10, r10, #0 \n\t"                  \
216    "str r14, [r0], #4 \n\t"                 \
217    /* word 3; carry left in r10, r14 */     \
218    "ldr r6, [r0] \n\t"                      \
219    "adds r9, r9, r6 \n\t"                   \
220    "adcs r10, r10, #0 \n\t"                 \
221    "ldr r7, [r1], #4 \n\t"                  \
222    "ldr r8, [r2], #4 \n\t"                  \
223    "mov r14, #0 \n\t"                       \
224    "umull r11, r12, r4, r8 \n\t"            \
225    "adds r9, r9, r11 \n\t"                  \
226    "adcs r10, r10, r12 \n\t"                \
227    "adc r14, r14, #0 \n\t"                  \
228    "umull r11, r12, r5, r7 \n\t"            \
229    "adds r9, r9, r11 \n\t"                  \
230    "adcs r10, r10, r12 \n\t"                \
231    "adc r14, r14, #0 \n\t"                  \
232    "str r9, [r0], #4 \n\t"                  \
233    /* word 4; carry left in r14, r9 */      \
234    "ldr r6, [r0] \n\t"                      \
235    "adds r10, r10, r6 \n\t"                 \
236    "adcs r14, r14, #0 \n\t"                 \
237    /* skip past already-loaded (r4, r5) */  \
238    "ldr r7, [r1], #8 \n\t"                  \
239    "ldr r8, [r2], #8 \n\t"                  \
240    "mov r9, #0 \n\t"                        \
241    "umull r11, r12, r4, r8 \n\t"            \
242    "adds r10, r10, r11 \n\t"                \
243    "adcs r14, r14, r12 \n\t"                \
244    "adc r9, r9, #0 \n\t"                    \
245    "umull r11, r12, r5, r7 \n\t"            \
246    "adds r10, r10, r11 \n\t"                \
247    "adcs r14, r14, r12 \n\t"                \
248    "adc r9, r9, #0 \n\t"                    \
249    "str r10, [r0], #4 \n\t"                 \
250    /* top two words: r4*r5 + carry (r14, r9) */ \
251    "umull r11, r12, r4, r5 \n\t"            \
252    "adds r11, r11, r14 \n\t"                \
253    "adc r12, r12, r9 \n\t"                  \
254    "stmia r0!, {r11, r12} \n\t"
255
/*
 * FAST_MULT_ASM_6 -- ARM inline-assembly fragment: 6-word x 6-word multiply.
 *
 * A word is 32 bits (umull forms the 64-bit product of two 32-bit registers).
 * Word indices in the comments below are relative to the register values on
 * entry:
 *   r0 -> result (12 words written),
 *   r1 -> left operand (6 words read),
 *   r2 -> right operand (6 words read).
 * "left"/"right" follow the naming of the existing comments in this file.
 *
 * Method: the partial product left[0..2] * right[3..5] is written to
 * result[3..8] first; result[0..11] is then produced one column at a time,
 * adding the stored partial words back in for columns 3..8.
 *
 * r3 is saved and restored with push/pop. r4-r12 and r14 are overwritten and
 * the condition flags are modified. On exit r0 has advanced by 48 bytes and
 * r1 and r2 by 24 bytes each.
 */
256#define FAST_MULT_ASM_6             \
257    "push {r3} \n\t"                \
258    "add r0, 12 \n\t"               \
259    "add r2, 12 \n\t"               \
260    "ldmia r1!, {r3,r4,r5} \n\t"    \
261    "ldmia r2!, {r6,r7,r8} \n\t"    \
262    /* r3,r4,r5 = left[0..2], r6,r7,r8 = right[3..5]; partial result[3] = low word of left[0]*right[3] */ \
263    "umull r11, r12, r3, r6 \n\t"   \
264    "stmia r0!, {r11} \n\t"         \
265    /* partial result[4] = left[0]*right[4] + left[1]*right[3] + carry */ \
266    "mov r10, #0 \n\t"              \
267    "umull r11, r9, r3, r7 \n\t"    \
268    "adds r12, r12, r11 \n\t"       \
269    "adc r9, r9, #0 \n\t"           \
270    "umull r11, r14, r4, r6 \n\t"   \
271    "adds r12, r12, r11 \n\t"       \
272    "adcs r9, r9, r14 \n\t"         \
273    "adc r10, r10, #0 \n\t"         \
274    "stmia r0!, {r12} \n\t"         \
275    /* partial result[5] = left[0]*right[5] + left[1]*right[4] + left[2]*right[3] + carry */ \
276    "mov r11, #0 \n\t"              \
277    "umull r12, r14, r3, r8 \n\t"   \
278    "adds r9, r9, r12 \n\t"         \
279    "adcs r10, r10, r14 \n\t"       \
280    "adc r11, r11, #0 \n\t"         \
281    "umull r12, r14, r4, r7 \n\t"   \
282    "adds r9, r9, r12 \n\t"         \
283    "adcs r10, r10, r14 \n\t"       \
284    "adc r11, r11, #0 \n\t"         \
285    "umull r12, r14, r5, r6 \n\t"   \
286    "adds r9, r9, r12 \n\t"         \
287    "adcs r10, r10, r14 \n\t"       \
288    "adc r11, r11, #0 \n\t"         \
289    "stmia r0!, {r9} \n\t"          \
290    /* partial result[6] = left[1]*right[5] + left[2]*right[4] + carry */ \
291    "mov r12, #0 \n\t"              \
292    "umull r14, r9, r4, r8 \n\t"    \
293    "adds r10, r10, r14 \n\t"       \
294    "adcs r11, r11, r9 \n\t"        \
295    "adc r12, r12, #0 \n\t"         \
296    "umull r14, r9, r5, r7 \n\t"    \
297    "adds r10, r10, r14 \n\t"       \
298    "adcs r11, r11, r9 \n\t"        \
299    "adc r12, r12, #0 \n\t"         \
300    "stmia r0!, {r10} \n\t"         \
301    /* partial result[7..8] = left[2]*right[5] + carry */ \
302    "umull r9, r10, r5, r8 \n\t"    \
303    "adds r11, r11, r9 \n\t"        \
304    "adc r12, r12, r10 \n\t"        \
305    "stmia r0!, {r11, r12} \n\t"    \
306    /* rewind r0 and r2 to their entry values; r6,r7,r8 = right[0..2] */ \
307    "sub r0, 36 \n\t"               \
308    "sub r2, 24 \n\t"               \
309    "ldmia r2!, {r6,r7,r8} \n\t"    \
310    /* column 0: left[0]*right[0] */ \
311    "umull r11, r12, r3, r6 \n\t"   \
312    "stmia r0!, {r11} \n\t"         \
313    /* column 1: left[0]*right[1] + left[1]*right[0] + carry */ \
314    "mov r10, #0 \n\t"              \
315    "umull r11, r9, r3, r7 \n\t"    \
316    "adds r12, r12, r11 \n\t"       \
317    "adc r9, r9, #0 \n\t"           \
318    "umull r11, r14, r4, r6 \n\t"   \
319    "adds r12, r12, r11 \n\t"       \
320    "adcs r9, r9, r14 \n\t"         \
321    "adc r10, r10, #0 \n\t"         \
322    "stmia r0!, {r12} \n\t"         \
323    /* column 2: left[0]*right[2] + left[1]*right[1] + left[2]*right[0] + carry */ \
324    "mov r11, #0 \n\t"              \
325    "umull r12, r14, r3, r8 \n\t"   \
326    "adds r9, r9, r12 \n\t"         \
327    "adcs r10, r10, r14 \n\t"       \
328    "adc r11, r11, #0 \n\t"         \
329    "umull r12, r14, r4, r7 \n\t"   \
330    "adds r9, r9, r12 \n\t"         \
331    "adcs r10, r10, r14 \n\t"       \
332    "adc r11, r11, #0 \n\t"         \
333    "umull r12, r14, r5, r6 \n\t"   \
334    "adds r9, r9, r12 \n\t"         \
335    "adcs r10, r10, r14 \n\t"       \
336    "adc r11, r11, #0 \n\t"         \
337    "stmia r0!, {r9} \n\t"          \
338    /* column 3 (r3 = left[3]): left[1]*right[2] + left[2]*right[1] + left[3]*right[0] + stored partial word */ \
339    "ldmia r1!, {r3} \n\t"          \
340    "mov r12, #0 \n\t"              \
341    "umull r14, r9, r4, r8 \n\t"    \
342    "adds r10, r10, r14 \n\t"       \
343    "adcs r11, r11, r9 \n\t"        \
344    "adc r12, r12, #0 \n\t"         \
345    "umull r14, r9, r5, r7 \n\t"    \
346    "adds r10, r10, r14 \n\t"       \
347    "adcs r11, r11, r9 \n\t"        \
348    "adc r12, r12, #0 \n\t"         \
349    "umull r14, r9, r3, r6 \n\t"    \
350    "adds r10, r10, r14 \n\t"       \
351    "adcs r11, r11, r9 \n\t"        \
352    "adc r12, r12, #0 \n\t"         \
353    "ldr r14, [r0] \n\t"            \
354    "adds r10, r10, r14 \n\t"       \
355    "adcs r11, r11, #0 \n\t"        \
356    "adc r12, r12, #0 \n\t"         \
357    "stmia r0!, {r10} \n\t"         \
358    /* column 4 (r4 = left[4]): left[2]*right[2] + left[3]*right[1] + left[4]*right[0] + stored partial word */ \
359    "ldmia r1!, {r4} \n\t"          \
360    "mov r14, #0 \n\t"              \
361    "umull r9, r10, r5, r8 \n\t"    \
362    "adds r11, r11, r9 \n\t"        \
363    "adcs r12, r12, r10 \n\t"       \
364    "adc r14, r14, #0 \n\t"         \
365    "umull r9, r10, r3, r7 \n\t"    \
366    "adds r11, r11, r9 \n\t"        \
367    "adcs r12, r12, r10 \n\t"       \
368    "adc r14, r14, #0 \n\t"         \
369    "umull r9, r10, r4, r6 \n\t"    \
370    "adds r11, r11, r9 \n\t"        \
371    "adcs r12, r12, r10 \n\t"       \
372    "adc r14, r14, #0 \n\t"         \
373    "ldr r9, [r0] \n\t"             \
374    "adds r11, r11, r9 \n\t"        \
375    "adcs r12, r12, #0 \n\t"        \
376    "adc r14, r14, #0 \n\t"         \
377    "stmia r0!, {r11} \n\t"         \
378    /* column 5 (r5 = left[5]): left[3]*right[2] + left[4]*right[1] + left[5]*right[0] + stored partial word */ \
379    "ldmia r1!, {r5} \n\t"          \
380    "mov r9, #0 \n\t"               \
381    "umull r10, r11, r3, r8 \n\t"   \
382    "adds r12, r12, r10 \n\t"       \
383    "adcs r14, r14, r11 \n\t"       \
384    "adc r9, r9, #0 \n\t"           \
385    "umull r10, r11, r4, r7 \n\t"   \
386    "adds r12, r12, r10 \n\t"       \
387    "adcs r14, r14, r11 \n\t"       \
388    "adc r9, r9, #0 \n\t"           \
389    "umull r10, r11, r5, r6 \n\t"   \
390    "adds r12, r12, r10 \n\t"       \
391    "adcs r14, r14, r11 \n\t"       \
392    "adc r9, r9, #0 \n\t"           \
393    "ldr r10, [r0] \n\t"            \
394    "adds r12, r12, r10 \n\t"       \
395    "adcs r14, r14, #0 \n\t"        \
396    "adc r9, r9, #0 \n\t"           \
397    "stmia r0!, {r12} \n\t"         \
398    /* column 6 (r6 = right[3]): left[3]*right[3] + left[4]*right[2] + left[5]*right[1] + stored partial word */ \
399    "ldmia r2!, {r6} \n\t"          \
400    "mov r10, #0 \n\t"              \
401    "umull r11, r12, r3, r6 \n\t"   \
402    "adds r14, r14, r11 \n\t"       \
403    "adcs r9, r9, r12 \n\t"         \
404    "adc r10, r10, #0 \n\t"         \
405    "umull r11, r12, r4, r8 \n\t"   \
406    "adds r14, r14, r11 \n\t"       \
407    "adcs r9, r9, r12 \n\t"         \
408    "adc r10, r10, #0 \n\t"         \
409    "umull r11, r12, r5, r7 \n\t"   \
410    "adds r14, r14, r11 \n\t"       \
411    "adcs r9, r9, r12 \n\t"         \
412    "adc r10, r10, #0 \n\t"         \
413    "ldr r11, [r0] \n\t"            \
414    "adds r14, r14, r11 \n\t"       \
415    "adcs r9, r9, #0 \n\t"          \
416    "adc r10, r10, #0 \n\t"         \
417    "stmia r0!, {r14} \n\t"         \
418    /* column 7 (r7 = right[4]): left[3]*right[4] + left[4]*right[3] + left[5]*right[2] + stored partial word */ \
419    "ldmia r2!, {r7} \n\t"          \
420    "mov r11, #0 \n\t"              \
421    "umull r12, r14, r3, r7 \n\t"   \
422    "adds r9, r9, r12 \n\t"         \
423    "adcs r10, r10, r14 \n\t"       \
424    "adc r11, r11, #0 \n\t"         \
425    "umull r12, r14, r4, r6 \n\t"   \
426    "adds r9, r9, r12 \n\t"         \
427    "adcs r10, r10, r14 \n\t"       \
428    "adc r11, r11, #0 \n\t"         \
429    "umull r12, r14, r5, r8 \n\t"   \
430    "adds r9, r9, r12 \n\t"         \
431    "adcs r10, r10, r14 \n\t"       \
432    "adc r11, r11, #0 \n\t"         \
433    "ldr r12, [r0] \n\t"            \
434    "adds r9, r9, r12 \n\t"         \
435    "adcs r10, r10, #0 \n\t"        \
436    "adc r11, r11, #0 \n\t"         \
437    "stmia r0!, {r9} \n\t"          \
438    /* column 8 (r8 = right[5]): left[3]*right[5] + left[4]*right[4] + left[5]*right[3] + stored partial word */ \
439    "ldmia r2!, {r8} \n\t"          \
440    "mov r12, #0 \n\t"              \
441    "umull r14, r9, r3, r8 \n\t"    \
442    "adds r10, r10, r14 \n\t"       \
443    "adcs r11, r11, r9 \n\t"        \
444    "adc r12, r12, #0 \n\t"         \
445    "umull r14, r9, r4, r7 \n\t"    \
446    "adds r10, r10, r14 \n\t"       \
447    "adcs r11, r11, r9 \n\t"        \
448    "adc r12, r12, #0 \n\t"         \
449    "umull r14, r9, r5, r6 \n\t"    \
450    "adds r10, r10, r14 \n\t"       \
451    "adcs r11, r11, r9 \n\t"        \
452    "adc r12, r12, #0 \n\t"         \
453    "ldr r14, [r0] \n\t"            \
454    "adds r10, r10, r14 \n\t"       \
455    "adcs r11, r11, #0 \n\t"        \
456    "adc r12, r12, #0 \n\t"         \
457    "stmia r0!, {r10} \n\t"         \
458    /* column 9: left[4]*right[5] + left[5]*right[4] + carry */ \
459    "mov r14, #0 \n\t"              \
460    "umull r9, r10, r4, r8 \n\t"    \
461    "adds r11, r11, r9 \n\t"        \
462    "adcs r12, r12, r10 \n\t"       \
463    "adc r14, r14, #0 \n\t"         \
464    "umull r9, r10, r5, r7 \n\t"    \
465    "adds r11, r11, r9 \n\t"        \
466    "adcs r12, r12, r10 \n\t"       \
467    "adc r14, r14, #0 \n\t"         \
468    "stmia r0!, {r11} \n\t"         \
469    /* columns 10..11: left[5]*right[5] + carry */ \
470    "umull r10, r11, r5, r8 \n\t"   \
471    "adds r12, r12, r10 \n\t"       \
472    "adc r14, r14, r11 \n\t"        \
473    "stmia r0!, {r12, r14} \n\t"    \
474    "pop {r3} \n\t"
475
/*
 * FAST_MULT_ASM_6_TO_7 -- ARM inline-assembly fragment that adds the terms of
 * one extra operand word to an existing product.
 *
 * If r3 == 6 the fragment branches forward to local label 1, which is not
 * defined here and must be provided by the code that follows the macro.
 * Otherwise it loads r4 = [r1] and r5 = [r2], steps r0, r1 and r2 back by
 * 24 bytes (6 words), and for six consecutive words computes
 *     [r0] + r4 * (word at r2) + r5 * (word at r1) + carry,
 * storing the low word back through r0 and carrying the rest forward.
 * Finally r4*r5 plus the remaining carry is stored as two more words.
 *
 * On exit r0 is 8 bytes and r1, r2 are 4 bytes past their entry values.
 * r4-r12 and r14 are overwritten and the condition flags are modified.
 *
 * NOTE(review): presumably intended to run directly after FAST_MULT_ASM_6,
 * with r0/r1/r2 as that macro leaves them and r3 holding the operand length
 * in words -- confirm against the code that expands these macros.
 */
476#define FAST_MULT_ASM_6_TO_7                    \
477    "cmp r3, #6 \n\t"                           \
478    "beq 1f \n\t"                               \
479                                                \
480    /* r4 = left high, r5 = right high */       \
481    "ldr r4, [r1] \n\t"                         \
482    "ldr r5, [r2] \n\t"                         \
483    /* step all three pointers back by 6 words */ \
484    "sub r0, #24 \n\t"                          \
485    "sub r1, #24 \n\t"                          \
486    "sub r2, #24 \n\t"                          \
487    /* word 0 (from the stepped-back pointers); carry left in r10, r14 */ \
488    "ldr r6, [r0] \n\t"                         \
489    "ldr r7, [r1], #4 \n\t"                     \
490    "ldr r8, [r2], #4 \n\t"                     \
491    "mov r14, #0 \n\t"                          \
492    "umull r9, r10, r4, r8 \n\t"                \
493    "umull r11, r12, r5, r7 \n\t"               \
494    "adds r9, r9, r6 \n\t"                      \
495    "adc r10, r10, #0 \n\t"                     \
496    "adds r9, r9, r11 \n\t"                     \
497    "adcs r10, r10, r12 \n\t"                   \
498    "adc r14, r14, #0 \n\t"                     \
499    "str r9, [r0], #4 \n\t"                     \
500    /* word 1; carry left in r14, r9 */         \
501    "ldr r6, [r0] \n\t"                         \
502    "adds r10, r10, r6 \n\t"                    \
503    "adcs r14, r14, #0 \n\t"                    \
504    "ldr r7, [r1], #4 \n\t"                     \
505    "ldr r8, [r2], #4 \n\t"                     \
506    "mov r9, #0 \n\t"                           \
507    "umull r11, r12, r4, r8 \n\t"               \
508    "adds r10, r10, r11 \n\t"                   \
509    "adcs r14, r14, r12 \n\t"                   \
510    "adc r9, r9, #0 \n\t"                       \
511    "umull r11, r12, r5, r7 \n\t"               \
512    "adds r10, r10, r11 \n\t"                   \
513    "adcs r14, r14, r12 \n\t"                   \
514    "adc r9, r9, #0 \n\t"                       \
515    "str r10, [r0], #4 \n\t"                    \
516    /* word 2; carry left in r9, r10 */         \
517    "ldr r6, [r0] \n\t"                         \
518    "adds r14, r14, r6 \n\t"                    \
519    "adcs r9, r9, #0 \n\t"                      \
520    "ldr r7, [r1], #4 \n\t"                     \
521    "ldr r8, [r2], #4 \n\t"                     \
522    "mov r10, #0 \n\t"                          \
523    "umull r11, r12, r4, r8 \n\t"               \
524    "adds r14, r14, r11 \n\t"                   \
525    "adcs r9, r9, r12 \n\t"                     \
526    "adc r10, r10, #0 \n\t"                     \
527    "umull r11, r12, r5, r7 \n\t"               \
528    "adds r14, r14, r11 \n\t"                   \
529    "adcs r9, r9, r12 \n\t"                     \
530    "adc r10, r10, #0 \n\t"                     \
531    "str r14, [r0], #4 \n\t"                    \
532    /* word 3; carry left in r10, r14 */        \
533    "ldr r6, [r0] \n\t"                         \
534    "adds r9, r9, r6 \n\t"                      \
535    "adcs r10, r10, #0 \n\t"                    \
536    "ldr r7, [r1], #4 \n\t"                     \
537    "ldr r8, [r2], #4 \n\t"                     \
538    "mov r14, #0 \n\t"                          \
539    "umull r11, r12, r4, r8 \n\t"               \
540    "adds r9, r9, r11 \n\t"                     \
541    "adcs r10, r10, r12 \n\t"                   \
542    "adc r14, r14, #0 \n\t"                     \
543    "umull r11, r12, r5, r7 \n\t"               \
544    "adds r9, r9, r11 \n\t"                     \
545    "adcs r10, r10, r12 \n\t"                   \
546    "adc r14, r14, #0 \n\t"                     \
547    "str r9, [r0], #4 \n\t"                     \
548    /* word 4; carry left in r14, r9 */         \
549    "ldr r6, [r0] \n\t"                         \
550    "adds r10, r10, r6 \n\t"                    \
551    "adcs r14, r14, #0 \n\t"                    \
552    "ldr r7, [r1], #4 \n\t"                     \
553    "ldr r8, [r2], #4 \n\t"                     \
554    "mov r9, #0 \n\t"                           \
555    "umull r11, r12, r4, r8 \n\t"               \
556    "adds r10, r10, r11 \n\t"                   \
557    "adcs r14, r14, r12 \n\t"                   \
558    "adc r9, r9, #0 \n\t"                       \
559    "umull r11, r12, r5, r7 \n\t"               \
560    "adds r10, r10, r11 \n\t"                   \
561    "adcs r14, r14, r12 \n\t"                   \
562    "adc r9, r9, #0 \n\t"                       \
563    "str r10, [r0], #4 \n\t"                    \
564    /* word 5; carry left in r9, r10 */         \
565    "ldr r6, [r0] \n\t"                         \
566    "adds r14, r14, r6 \n\t"                    \
567    "adcs r9, r9, #0 \n\t"                      \
568    /* skip past already-loaded (r4, r5) */     \
569    "ldr r7, [r1], #8 \n\t"                     \
570    "ldr r8, [r2], #8 \n\t"                     \
571    "mov r10, #0 \n\t"                          \
572    "umull r11, r12, r4, r8 \n\t"               \
573    "adds r14, r14, r11 \n\t"                   \
574    "adcs r9, r9, r12 \n\t"                     \
575    "adc r10, r10, #0 \n\t"                     \
576    "umull r11, r12, r5, r7 \n\t"               \
577    "adds r14, r14, r11 \n\t"                   \
578    "adcs r9, r9, r12 \n\t"                     \
579    "adc r10, r10, #0 \n\t"                     \
580    "str r14, [r0], #4 \n\t"                    \
581    /* top two words: r4*r5 + carry (r9, r10) */ \
582    "umull r11, r12, r4, r5 \n\t"               \
583    "adds r11, r11, r9 \n\t"                    \
584    "adc r12, r12, r10 \n\t"                    \
585    "stmia r0!, {r11, r12} \n\t"
586
587#define FAST_MULT_ASM_7                \
588    "push {r3} \n\t"                   \
589    "add r0, 24 \n\t"                  \
590    "add r2, 24 \n\t"                  \
591    "ldmia r1!, {r3} \n\t"             \
592    "ldmia r2!, {r6} \n\t"             \
593                                       \
594    "umull r9, r10, r3, r6 \n\t"       \
595    "stmia r0!, {r9, r10} \n\t"        \
596                                       \
597    "sub r0, 20 \n\t"                  \
598    "sub r2, 16 \n\t"                  \
599    "ldmia r2!, {r6, r7, r8} \n\t"     \
600    "ldmia r1!, {r4, r5} \n\t"         \
601                                       \
602    "umull r9, r10, r3, r6 \n\t"       \
603    "stmia r0!, {r9} \n\t"             \
604                                       \
605    "mov r14, #0 \n\t"                 \
606    "umull r9, r12, r3, r7 \n\t"       \
607    "adds r10, r10, r9 \n\t"           \
608    "adc r12, r12, #0 \n\t"            \
609    "umull r9, r11, r4, r6 \n\t"       \
610    "adds r10, r10, r9 \n\t"           \
611    "adcs r12, r12, r11 \n\t"          \
612    "adc r14, r14, #0 \n\t"            \
613    "stmia r0!, {r10} \n\t"            \
614                                       \
615    "mov r9, #0 \n\t"                  \
616    "umull r10, r11, r3, r8 \n\t"      \
617    "adds r12, r12, r10 \n\t"          \
618    "adcs r14, r14, r11 \n\t"          \
619    "adc r9, r9, #0 \n\t"              \
620    "umull r10, r11, r4, r7 \n\t"      \
621    "adds r12, r12, r10 \n\t"          \
622    "adcs r14, r14, r11 \n\t"          \
623    "adc r9, r9, #0 \n\t"              \
624    "umull r10, r11, r5, r6 \n\t"      \
625    "adds r12, r12, r10 \n\t"          \
626    "adcs r14, r14, r11 \n\t"          \
627    "adc r9, r9, #0 \n\t"              \
628    "stmia r0!, {r12} \n\t"            \
629                                       \
630    "ldmia r1!, {r3} \n\t"             \
631    "mov r10, #0 \n\t"                 \
632    "umull r11, r12, r4, r8 \n\t"      \
633    "adds r14, r14, r11 \n\t"          \
634    "adcs r9, r9, r12 \n\t"            \
635    "adc r10, r10, #0 \n\t"            \
636    "umull r11, r12, r5, r7 \n\t"      \
637    "adds r14, r14, r11 \n\t"          \
638    "adcs r9, r9, r12 \n\t"            \
639    "adc r10, r10, #0 \n\t"            \
640    "umull r11, r12, r3, r6 \n\t"      \
641    "adds r14, r14, r11 \n\t"          \
642    "adcs r9, r9, r12 \n\t"            \
643    "adc r10, r10, #0 \n\t"            \
644    "ldr r11, [r0] \n\t"               \
645    "adds r14, r14, r11 \n\t"          \
646    "adcs r9, r9, #0 \n\t"             \
647    "adc r10, r10, #0 \n\t"            \
648    "stmia r0!, {r14} \n\t"            \
649                                       \
650    "ldmia r2!, {r6} \n\t"             \
651    "mov r11, #0 \n\t"                 \
652    "umull r12, r14, r4, r6 \n\t"      \
653    "adds r9, r9, r12 \n\t"            \
654    "adcs r10, r10, r14 \n\t"          \
655    "adc r11, r11, #0 \n\t"            \
656    "umull r12, r14, r5, r8 \n\t"      \
657    "adds r9, r9, r12 \n\t"            \
658    "adcs r10, r10, r14 \n\t"          \
659    "adc r11, r11, #0 \n\t"            \
660    "umull r12, r14, r3, r7 \n\t"      \
661    "adds r9, r9, r12 \n\t"            \
662    "adcs r10, r10, r14 \n\t"          \
663    "adc r11, r11, #0 \n\t"            \
664    "ldr r12, [r0] \n\t"               \
665    "adds r9, r9, r12 \n\t"            \
666    "adcs r10, r10, #0 \n\t"           \
667    "adc r11, r11, #0 \n\t"            \
668    "stmia r0!, {r9} \n\t"             \
669                                       \
670    "mov r12, #0 \n\t"                 \
671    "umull r14, r9, r5, r6 \n\t"       \
672    "adds r10, r10, r14 \n\t"          \
673    "adcs r11, r11, r9 \n\t"           \
674    "adc r12, r12, #0 \n\t"            \
675    "umull r14, r9, r3, r8 \n\t"       \
676    "adds r10, r10, r14 \n\t"          \
677    "adcs r11, r11, r9 \n\t"           \
678    "adc r12, r12, #0 \n\t"            \
679    "stmia r0!, {r10} \n\t"            \
680                                       \
681    "umull r9, r10, r3, r6 \n\t"       \
682    "adds r11, r11, r9 \n\t"           \
683    "adc r12, r12, r10 \n\t"           \
684    "stmia r0!, {r11, r12} \n\t"       \
685                                       \
686    "sub r0, 44 \n\t"                  \
687    "sub r1, 16 \n\t"                  \
688    "sub r2, 28 \n\t"                  \
689    "ldmia r1!, {r3,r4,r5} \n\t"       \
690    "ldmia r2!, {r6,r7,r8} \n\t"       \
691                                       \
692    "umull r9, r10, r3, r6 \n\t"       \
693    "stmia r0!, {r9} \n\t"             \
694                                       \
695    "mov r14, #0 \n\t"                 \
696    "umull r9, r12, r3, r7 \n\t"       \
697    "adds r10, r10, r9 \n\t"           \
698    "adc r12, r12, #0 \n\t"            \
699    "umull r9, r11, r4, r6 \n\t"       \
700    "adds r10, r10, r9 \n\t"           \
701    "adcs r12, r12, r11 \n\t"          \
702    "adc r14, r14, #0 \n\t"            \
703    "stmia r0!, {r10} \n\t"            \
704                                       \
705    "mov r9, #0 \n\t"                  \
706    "umull r10, r11, r3, r8 \n\t"      \
707    "adds r12, r12, r10 \n\t"          \
708    "adcs r14, r14, r11 \n\t"          \
709    "adc r9, r9, #0 \n\t"              \
710    "umull r10, r11, r4, r7 \n\t"      \
711    "adds r12, r12, r10 \n\t"          \
712    "adcs r14, r14, r11 \n\t"          \
713    "adc r9, r9, #0 \n\t"              \
714    "umull r10, r11, r5, r6 \n\t"      \
715    "adds r12, r12, r10 \n\t"          \
716    "adcs r14, r14, r11 \n\t"          \
717    "adc r9, r9, #0 \n\t"              \
718    "stmia r0!, {r12} \n\t"            \
719                                       \
720    "ldmia r1!, {r3} \n\t"             \
721    "mov r10, #0 \n\t"                 \
722    "umull r11, r12, r4, r8 \n\t"      \
723    "adds r14, r14, r11 \n\t"          \
724    "adcs r9, r9, r12 \n\t"            \
725    "adc r10, r10, #0 \n\t"            \
726    "umull r11, r12, r5, r7 \n\t"      \
727    "adds r14, r14, r11 \n\t"          \
728    "adcs r9, r9, r12 \n\t"            \
729    "adc r10, r10, #0 \n\t"            \
730    "umull r11, r12, r3, r6 \n\t"      \
731    "adds r14, r14, r11 \n\t"          \
732    "adcs r9, r9, r12 \n\t"            \
733    "adc r10, r10, #0 \n\t"            \
734    "ldr r11, [r0] \n\t"               \
735    "adds r14, r14, r11 \n\t"          \
736    "adcs r9, r9, #0 \n\t"             \
737    "adc r10, r10, #0 \n\t"            \
738    "stmia r0!, {r14} \n\t"            \
739                                       \
740    "ldmia r1!, {r4} \n\t"             \
741    "mov r11, #0 \n\t"                 \
742    "umull r12, r14, r5, r8 \n\t"      \
743    "adds r9, r9, r12 \n\t"            \
744    "adcs r10, r10, r14 \n\t"          \
745    "adc r11, r11, #0 \n\t"            \
746    "umull r12, r14, r3, r7 \n\t"      \
747    "adds r9, r9, r12 \n\t"            \
748    "adcs r10, r10, r14 \n\t"          \
749    "adc r11, r11, #0 \n\t"            \
750    "umull r12, r14, r4, r6 \n\t"      \
751    "adds r9, r9, r12 \n\t"            \
752    "adcs r10, r10, r14 \n\t"          \
753    "adc r11, r11, #0 \n\t"            \
754    "ldr r12, [r0] \n\t"               \
755    "adds r9, r9, r12 \n\t"            \
756    "adcs r10, r10, #0 \n\t"           \
757    "adc r11, r11, #0 \n\t"            \
758    "stmia r0!, {r9} \n\t"             \
759                                       \
760    "ldmia r1!, {r5} \n\t"             \
761    "mov r12, #0 \n\t"                 \
762    "umull r14, r9, r3, r8 \n\t"       \
763    "adds r10, r10, r14 \n\t"          \
764    "adcs r11, r11, r9 \n\t"           \
765    "adc r12, r12, #0 \n\t"            \
766    "umull r14, r9, r4, r7 \n\t"       \
767    "adds r10, r10, r14 \n\t"          \
768    "adcs r11, r11, r9 \n\t"           \
769    "adc r12, r12, #0 \n\t"            \
770    "umull r14, r9, r5, r6 \n\t"       \
771    "adds r10, r10, r14 \n\t"          \
772    "adcs r11, r11, r9 \n\t"           \
773    "adc r12, r12, #0 \n\t"            \
774    "ldr r14, [r0] \n\t"               \
775    "adds r10, r10, r14 \n\t"          \
776    "adcs r11, r11, #0 \n\t"           \
777    "adc r12, r12, #0 \n\t"            \
778    "stmia r0!, {r10} \n\t"            \
779                                       \
780    "ldmia r1!, {r3} \n\t"             \
781    "mov r14, #0 \n\t"                 \
782    "umull r9, r10, r4, r8 \n\t"       \
783    "adds r11, r11, r9 \n\t"           \
784    "adcs r12, r12, r10 \n\t"          \
785    "adc r14, r14, #0 \n\t"            \
786    "umull r9, r10, r5, r7 \n\t"       \
787    "adds r11, r11, r9 \n\t"           \
788    "adcs r12, r12, r10 \n\t"          \
789    "adc r14, r14, #0 \n\t"            \
790    "umull r9, r10, r3, r6 \n\t"       \
791    "adds r11, r11, r9 \n\t"           \
792    "adcs r12, r12, r10 \n\t"          \
793    "adc r14, r14, #0 \n\t"            \
794    "ldr r9, [r0] \n\t"                \
795    "adds r11, r11, r9 \n\t"           \
796    "adcs r12, r12, #0 \n\t"           \
797    "adc r14, r14, #0 \n\t"            \
798    "stmia r0!, {r11} \n\t"            \
799                                       \
800    "ldmia r2!, {r6} \n\t"             \
801    "mov r9, #0 \n\t"                  \
802    "umull r10, r11, r4, r6 \n\t"      \
803    "adds r12, r12, r10 \n\t"          \
804    "adcs r14, r14, r11 \n\t"          \
805    "adc r9, r9, #0 \n\t"              \
806    "umull r10, r11, r5, r8 \n\t"      \
807    "adds r12, r12, r10 \n\t"          \
808    "adcs r14, r14, r11 \n\t"          \
809    "adc r9, r9, #0 \n\t"              \
810    "umull r10, r11, r3, r7 \n\t"      \
811    "adds r12, r12, r10 \n\t"          \
812    "adcs r14, r14, r11 \n\t"          \
813    "adc r9, r9, #0 \n\t"              \
814    "ldr r10, [r0] \n\t"               \
815    "adds r12, r12, r10 \n\t"          \
816    "adcs r14, r14, #0 \n\t"           \
817    "adc r9, r9, #0 \n\t"              \
818    "stmia r0!, {r12} \n\t"            \
819                                       \
820    "ldmia r2!, {r7} \n\t"             \
821    "mov r10, #0 \n\t"                 \
822    "umull r11, r12, r4, r7 \n\t"      \
823    "adds r14, r14, r11 \n\t"          \
824    "adcs r9, r9, r12 \n\t"            \
825    "adc r10, r10, #0 \n\t"            \
826    "umull r11, r12, r5, r6 \n\t"      \
827    "adds r14, r14, r11 \n\t"          \
828    "adcs r9, r9, r12 \n\t"            \
829    "adc r10, r10, #0 \n\t"            \
830    "umull r11, r12, r3, r8 \n\t"      \
831    "adds r14, r14, r11 \n\t"          \
832    "adcs r9, r9, r12 \n\t"            \
833    "adc r10, r10, #0 \n\t"            \
834    "ldr r11, [r0] \n\t"               \
835    "adds r14, r14, r11 \n\t"          \
836    "adcs r9, r9, #0 \n\t"             \
837    "adc r10, r10, #0 \n\t"            \
838    "stmia r0!, {r14} \n\t"            \
839                                       \
840    "ldmia r2!, {r8} \n\t"             \
841    "mov r11, #0 \n\t"                 \
842    "umull r12, r14, r4, r8 \n\t"      \
843    "adds r9, r9, r12 \n\t"            \
844    "adcs r10, r10, r14 \n\t"          \
845    "adc r11, r11, #0 \n\t"            \
846    "umull r12, r14, r5, r7 \n\t"      \
847    "adds r9, r9, r12 \n\t"            \
848    "adcs r10, r10, r14 \n\t"          \
849    "adc r11, r11, #0 \n\t"            \
850    "umull r12, r14, r3, r6 \n\t"      \
851    "adds r9, r9, r12 \n\t"            \
852    "adcs r10, r10, r14 \n\t"          \
853    "adc r11, r11, #0 \n\t"            \
854    "ldr r12, [r0] \n\t"               \
855    "adds r9, r9, r12 \n\t"            \
856    "adcs r10, r10, #0 \n\t"           \
857    "adc r11, r11, #0 \n\t"            \
858    "stmia r0!, {r9} \n\t"             \
859                                       \
860    "ldmia r2!, {r6} \n\t"             \
861    "mov r12, #0 \n\t"                 \
862    "umull r14, r9, r4, r6 \n\t"       \
863    "adds r10, r10, r14 \n\t"          \
864    "adcs r11, r11, r9 \n\t"           \
865    "adc r12, r12, #0 \n\t"            \
866    "umull r14, r9, r5, r8 \n\t"       \
867    "adds r10, r10, r14 \n\t"          \
868    "adcs r11, r11, r9 \n\t"           \
869    "adc r12, r12, #0 \n\t"            \
870    "umull r14, r9, r3, r7 \n\t"       \
871    "adds r10, r10, r14 \n\t"          \
872    "adcs r11, r11, r9 \n\t"           \
873    "adc r12, r12, #0 \n\t"            \
874    "ldr r14, [r0] \n\t"               \
875    "adds r10, r10, r14 \n\t"          \
876    "adcs r11, r11, #0 \n\t"           \
877    "adc r12, r12, #0 \n\t"            \
878    "stmia r0!, {r10} \n\t"            \
879                                       \
880    "mov r14, #0 \n\t"                 \
881    "umull r9, r10, r5, r6 \n\t"       \
882    "adds r11, r11, r9 \n\t"           \
883    "adcs r12, r12, r10 \n\t"          \
884    "adc r14, r14, #0 \n\t"            \
885    "umull r9, r10, r3, r8 \n\t"       \
886    "adds r11, r11, r9 \n\t"           \
887    "adcs r12, r12, r10 \n\t"          \
888    "adc r14, r14, #0 \n\t"            \
889    "stmia r0!, {r11} \n\t"            \
890                                       \
891    "umull r10, r11, r3, r6 \n\t"      \
892    "adds r12, r12, r10 \n\t"          \
893    "adc r14, r14, r11 \n\t"           \
894    "stmia r0!, {r12, r14} \n\t"       \
895    "pop {r3} \n\t"
896
/*
 * FAST_MULT_ASM_7_TO_8: inline-assembly fragment that folds one extra high
 * word of each operand into a multi-word product that is already in memory.
 *
 * Flow, as written below:
 *   - If r3 == 7, branch forward to the local label "1" and skip everything.
 *     That label is not defined in this macro; the code that expands the
 *     macro has to provide it.
 *   - Otherwise load r4 = [r1] and r5 = [r2] (the "high" words), then move
 *     r0, r1 and r2 back by 28 bytes (7 words).
 *   - Seven steps follow. Each step reads the word at r0 (r6), one word
 *     through r1 (r7) and one word through r2 (r8), adds r4*r8 + r5*r7 plus
 *     the carry of the previous step to it, and stores the low word back
 *     through r0 with post-increment.
 *   - Finally r4*r5 plus the two remaining carry words is stored as two new
 *     words through r0.
 *
 * Pointer movement relative to entry: r0 ends 8 bytes higher; r1 and r2 end
 * 4 bytes higher (the last step post-increments them by 8 to step over the
 * word that was loaded into r4/r5 at the start).
 *
 * Registers written: r4-r12 and r14, plus the condition flags. The running
 * sum lives in a rotating triple drawn from r9, r10 and r14; r11:r12 receive
 * the umull results (the very first product lands directly in r9:r10).
 *
 * NOTE(review): presumably r0 is the result pointer left just past a 7-word
 * by 7-word product and r1/r2 point at word 7 of the left/right operands;
 * this macro does not show that setup - confirm at the expansion site.
 */
897#define FAST_MULT_ASM_7_TO_8                 \
    /* nothing to do when r3 == 7 (label 1 is outside this macro) */ \
898    "cmp r3, #7 \n\t"                        \
899    "beq 1f \n\t"                            \
900                                             \
901    /* r4 = left high, r5 = right high */    \
902    "ldr r4, [r1] \n\t"                      \
903    "ldr r5, [r2] \n\t"                      \
904                                             \
    /* step all three pointers back 7 words */ \
905    "sub r0, #28 \n\t"                       \
906    "sub r1, #28 \n\t"                       \
907    "sub r2, #28 \n\t"                       \
908                                             \
    /* step 0: running sum in r9 (low), r10, r14; no incoming carry. */ \
    /* A umull high word is at most 0xFFFFFFFE, so the plain adc on   */ \
    /* r10 cannot wrap.                                               */ \
909    "ldr r6, [r0] \n\t"                      \
910    "ldr r7, [r1], #4 \n\t"                  \
911    "ldr r8, [r2], #4 \n\t"                  \
912    "mov r14, #0 \n\t"                       \
913    "umull r9, r10, r4, r8 \n\t"             \
914    "umull r11, r12, r5, r7 \n\t"            \
915    "adds r9, r9, r6 \n\t"                   \
916    "adc r10, r10, #0 \n\t"                  \
917    "adds r9, r9, r11 \n\t"                  \
918    "adcs r10, r10, r12 \n\t"                \
919    "adc r14, r14, #0 \n\t"                  \
920    "str r9, [r0], #4 \n\t"                  \
921                                             \
    /* step 1: running sum in r10 (low), r14, r9 */ \
922    "ldr r6, [r0] \n\t"                      \
923    "adds r10, r10, r6 \n\t"                 \
924    "adcs r14, r14, #0 \n\t"                 \
925    "ldr r7, [r1], #4 \n\t"                  \
926    "ldr r8, [r2], #4 \n\t"                  \
927    "mov r9, #0 \n\t"                        \
928    "umull r11, r12, r4, r8 \n\t"            \
929    "adds r10, r10, r11 \n\t"                \
930    "adcs r14, r14, r12 \n\t"                \
931    "adc r9, r9, #0 \n\t"                    \
932    "umull r11, r12, r5, r7 \n\t"            \
933    "adds r10, r10, r11 \n\t"                \
934    "adcs r14, r14, r12 \n\t"                \
935    "adc r9, r9, #0 \n\t"                    \
936    "str r10, [r0], #4 \n\t"                 \
937                                             \
    /* step 2: running sum in r14 (low), r9, r10 */ \
938    "ldr r6, [r0] \n\t"                      \
939    "adds r14, r14, r6 \n\t"                 \
940    "adcs r9, r9, #0 \n\t"                   \
941    "ldr r7, [r1], #4 \n\t"                  \
942    "ldr r8, [r2], #4 \n\t"                  \
943    "mov r10, #0 \n\t"                       \
944    "umull r11, r12, r4, r8 \n\t"            \
945    "adds r14, r14, r11 \n\t"                \
946    "adcs r9, r9, r12 \n\t"                  \
947    "adc r10, r10, #0 \n\t"                  \
948    "umull r11, r12, r5, r7 \n\t"            \
949    "adds r14, r14, r11 \n\t"                \
950    "adcs r9, r9, r12 \n\t"                  \
951    "adc r10, r10, #0 \n\t"                  \
952    "str r14, [r0], #4 \n\t"                 \
953                                             \
    /* step 3: running sum in r9 (low), r10, r14 */ \
954    "ldr r6, [r0] \n\t"                      \
955    "adds r9, r9, r6 \n\t"                   \
956    "adcs r10, r10, #0 \n\t"                 \
957    "ldr r7, [r1], #4 \n\t"                  \
958    "ldr r8, [r2], #4 \n\t"                  \
959    "mov r14, #0 \n\t"                       \
960    "umull r11, r12, r4, r8 \n\t"            \
961    "adds r9, r9, r11 \n\t"                  \
962    "adcs r10, r10, r12 \n\t"                \
963    "adc r14, r14, #0 \n\t"                  \
964    "umull r11, r12, r5, r7 \n\t"            \
965    "adds r9, r9, r11 \n\t"                  \
966    "adcs r10, r10, r12 \n\t"                \
967    "adc r14, r14, #0 \n\t"                  \
968    "str r9, [r0], #4 \n\t"                  \
969                                             \
    /* step 4: running sum in r10 (low), r14, r9 */ \
970    "ldr r6, [r0] \n\t"                      \
971    "adds r10, r10, r6 \n\t"                 \
972    "adcs r14, r14, #0 \n\t"                 \
973    "ldr r7, [r1], #4 \n\t"                  \
974    "ldr r8, [r2], #4 \n\t"                  \
975    "mov r9, #0 \n\t"                        \
976    "umull r11, r12, r4, r8 \n\t"            \
977    "adds r10, r10, r11 \n\t"                \
978    "adcs r14, r14, r12 \n\t"                \
979    "adc r9, r9, #0 \n\t"                    \
980    "umull r11, r12, r5, r7 \n\t"            \
981    "adds r10, r10, r11 \n\t"                \
982    "adcs r14, r14, r12 \n\t"                \
983    "adc r9, r9, #0 \n\t"                    \
984    "str r10, [r0], #4 \n\t"                 \
985                                             \
    /* step 5: running sum in r14 (low), r9, r10 */ \
986    "ldr r6, [r0] \n\t"                      \
987    "adds r14, r14, r6 \n\t"                 \
988    "adcs r9, r9, #0 \n\t"                   \
989    "ldr r7, [r1], #4 \n\t"                  \
990    "ldr r8, [r2], #4 \n\t"                  \
991    "mov r10, #0 \n\t"                       \
992    "umull r11, r12, r4, r8 \n\t"            \
993    "adds r14, r14, r11 \n\t"                \
994    "adcs r9, r9, r12 \n\t"                  \
995    "adc r10, r10, #0 \n\t"                  \
996    "umull r11, r12, r5, r7 \n\t"            \
997    "adds r14, r14, r11 \n\t"                \
998    "adcs r9, r9, r12 \n\t"                  \
999    "adc r10, r10, #0 \n\t"                  \
1000    "str r14, [r0], #4 \n\t"                 \
1001                                             \
    /* step 6 (last): running sum in r9 (low), r10, r14 */ \
1002    "ldr r6, [r0] \n\t"                      \
1003    "adds r9, r9, r6 \n\t"                   \
1004    "adcs r10, r10, #0 \n\t"                 \
1005    /* skip past already-loaded (r4, r5) */  \
1006    "ldr r7, [r1], #8 \n\t"                  \
1007    "ldr r8, [r2], #8 \n\t"                  \
1008    "mov r14, #0 \n\t"                       \
1009    "umull r11, r12, r4, r8 \n\t"            \
1010    "adds r9, r9, r11 \n\t"                  \
1011    "adcs r10, r10, r12 \n\t"                \
1012    "adc r14, r14, #0 \n\t"                  \
1013    "umull r11, r12, r5, r7 \n\t"            \
1014    "adds r9, r9, r11 \n\t"                  \
1015    "adcs r10, r10, r12 \n\t"                \
1016    "adc r14, r14, #0 \n\t"                  \
1017    "str r9, [r0], #4 \n\t"                  \
1018                                             \
    /* top two words: r4*r5 plus the carries left in r10 and r14 */ \
1019    "umull r11, r12, r4, r5 \n\t"            \
1020    "adds r11, r11, r10 \n\t"                \
1021    "adc r12, r12, r14 \n\t"                 \
1022    "stmia r0!, {r11, r12} \n\t"
1023
/*
 * FAST_MULT_ASM_8: inline-assembly fragment for a full 8-word by 8-word
 * unsigned multiply (umull) that writes 16 result words.
 *
 * Notation used in the comments below: a[i] is word i read through r1,
 * b[j] is word j read through r2 and out[k] is word k accessed through r0,
 * all counted from the pointer values on entry. "aI*bJ" is the 64-bit
 * product of a[I] and b[J]. Each column comment lists only the products
 * added in that column; the carry from the previous column of the same pass
 * is always included as well, and only the low word of the column sum is
 * stored.
 * NOTE(review): which operand is "left" and which is "right" is not visible
 * in this macro; confirm at the expansion site.
 *
 * The 64 partial products are accumulated in three passes:
 *   pass 1: a[0..1] * b[6..7]                 -> writes out[6..9]
 *   pass 2: the rest of a[0..4] * b[3..7]     -> writes out[3..12], merging
 *                                                the stored out[6..9]
 *   pass 3: a[0..7] * b[0..2], a[5..7] * b[3..7]
 *                                             -> writes out[0..15], merging
 *                                                the stored out[3..12]
 *
 * At most three words of each operand are held at a time (a in r3/r4/r5,
 * b in r6/r7/r8) and are reloaded one by one as the column index advances.
 * The column sum lives in a rotating three-register window taken from
 * r9, r10, r11, r12 and r14; the two registers outside the window receive
 * the umull result.
 *
 * On exit r0 is 64 bytes and r1, r2 are 32 bytes past their entry values.
 * r3 is used as scratch but saved/restored with push/pop (4 bytes of stack).
 * Registers written and not restored: r4-r12, r14 and the condition flags.
 */
1024#define FAST_MULT_ASM_8             \
    /* pass 1 setup: r0 -> out[6], r3,r4 = a[0],a[1], r6,r7 = b[6],b[7] */ \
1025    "push {r3} \n\t"                \
1026    "add r0, 24 \n\t"               \
1027    "add r2, 24 \n\t"               \
1028    "ldmia r1!, {r3,r4} \n\t"       \
1029    "ldmia r2!, {r6,r7} \n\t"       \
1030                                    \
    /* out[6] = a0*b6 */ \
1031    "umull r11, r12, r3, r6 \n\t"   \
1032    "stmia r0!, {r11} \n\t"         \
1033                                    \
    /* out[7] = a0*b7 + a1*b6 */ \
1034    "mov r10, #0 \n\t"              \
1035    "umull r11, r9, r3, r7 \n\t"    \
1036    "adds r12, r12, r11 \n\t"       \
1037    "adc r9, r9, #0 \n\t"           \
1038    "umull r11, r14, r4, r6 \n\t"   \
1039    "adds r12, r12, r11 \n\t"       \
1040    "adcs r9, r9, r14 \n\t"         \
1041    "adc r10, r10, #0 \n\t"         \
1042    "stmia r0!, {r12} \n\t"         \
1043                                    \
    /* out[8], out[9] = a1*b7 */ \
1044    "umull r12, r14, r4, r7 \n\t"   \
1045    "adds r9, r9, r12 \n\t"         \
1046    "adc r10, r10, r14 \n\t"        \
1047    "stmia r0!, {r9, r10} \n\t"     \
1048                                    \
    /* pass 2 setup: r0 -> out[3], r6,r7,r8 = b[3..5], r5 = a[2] */ \
1049    "sub r0, 28 \n\t"               \
1050    "sub r2, 20 \n\t"               \
1051    "ldmia r2!, {r6,r7,r8} \n\t"    \
1052    "ldmia r1!, {r5} \n\t"          \
1053                                    \
    /* out[3] = a0*b3 */ \
1054    "umull r11, r12, r3, r6 \n\t"   \
1055    "stmia r0!, {r11} \n\t"         \
1056                                    \
    /* out[4] = a0*b4 + a1*b3 */ \
1057    "mov r10, #0 \n\t"              \
1058    "umull r11, r9, r3, r7 \n\t"    \
1059    "adds r12, r12, r11 \n\t"       \
1060    "adc r9, r9, #0 \n\t"           \
1061    "umull r11, r14, r4, r6 \n\t"   \
1062    "adds r12, r12, r11 \n\t"       \
1063    "adcs r9, r9, r14 \n\t"         \
1064    "adc r10, r10, #0 \n\t"         \
1065    "stmia r0!, {r12} \n\t"         \
1066                                    \
    /* out[5] = a0*b5 + a1*b4 + a2*b3 */ \
1067    "mov r11, #0 \n\t"              \
1068    "umull r12, r14, r3, r8 \n\t"   \
1069    "adds r9, r9, r12 \n\t"         \
1070    "adcs r10, r10, r14 \n\t"       \
1071    "adc r11, r11, #0 \n\t"         \
1072    "umull r12, r14, r4, r7 \n\t"   \
1073    "adds r9, r9, r12 \n\t"         \
1074    "adcs r10, r10, r14 \n\t"       \
1075    "adc r11, r11, #0 \n\t"         \
1076    "umull r12, r14, r5, r6 \n\t"   \
1077    "adds r9, r9, r12 \n\t"         \
1078    "adcs r10, r10, r14 \n\t"       \
1079    "adc r11, r11, #0 \n\t"         \
1080    "stmia r0!, {r9} \n\t"          \
1081                                    \
    /* r3 = a[3]; out[6] += a1*b5 + a2*b4 + a3*b3 */ \
1082    "ldmia r1!, {r3} \n\t"          \
1083    "mov r12, #0 \n\t"              \
1084    "umull r14, r9, r4, r8 \n\t"    \
1085    "adds r10, r10, r14 \n\t"       \
1086    "adcs r11, r11, r9 \n\t"        \
1087    "adc r12, r12, #0 \n\t"         \
1088    "umull r14, r9, r5, r7 \n\t"    \
1089    "adds r10, r10, r14 \n\t"       \
1090    "adcs r11, r11, r9 \n\t"        \
1091    "adc r12, r12, #0 \n\t"         \
1092    "umull r14, r9, r3, r6 \n\t"    \
1093    "adds r10, r10, r14 \n\t"       \
1094    "adcs r11, r11, r9 \n\t"        \
1095    "adc r12, r12, #0 \n\t"         \
1096    "ldr r14, [r0] \n\t"            \
1097    "adds r10, r10, r14 \n\t"       \
1098    "adcs r11, r11, #0 \n\t"        \
1099    "adc r12, r12, #0 \n\t"         \
1100    "stmia r0!, {r10} \n\t"         \
1101                                    \
    /* r4 = a[4]; out[7] += a2*b5 + a3*b4 + a4*b3 */ \
1102    "ldmia r1!, {r4} \n\t"          \
1103    "mov r14, #0 \n\t"              \
1104    "umull r9, r10, r5, r8 \n\t"    \
1105    "adds r11, r11, r9 \n\t"        \
1106    "adcs r12, r12, r10 \n\t"       \
1107    "adc r14, r14, #0 \n\t"         \
1108    "umull r9, r10, r3, r7 \n\t"    \
1109    "adds r11, r11, r9 \n\t"        \
1110    "adcs r12, r12, r10 \n\t"       \
1111    "adc r14, r14, #0 \n\t"         \
1112    "umull r9, r10, r4, r6 \n\t"    \
1113    "adds r11, r11, r9 \n\t"        \
1114    "adcs r12, r12, r10 \n\t"       \
1115    "adc r14, r14, #0 \n\t"         \
1116    "ldr r9, [r0] \n\t"             \
1117    "adds r11, r11, r9 \n\t"        \
1118    "adcs r12, r12, #0 \n\t"        \
1119    "adc r14, r14, #0 \n\t"         \
1120    "stmia r0!, {r11} \n\t"         \
1121                                    \
    /* r6 = b[6]; out[8] += a2*b6 + a3*b5 + a4*b4 */ \
1122    "ldmia r2!, {r6} \n\t"          \
1123    "mov r9, #0 \n\t"               \
1124    "umull r10, r11, r5, r6 \n\t"   \
1125    "adds r12, r12, r10 \n\t"       \
1126    "adcs r14, r14, r11 \n\t"       \
1127    "adc r9, r9, #0 \n\t"           \
1128    "umull r10, r11, r3, r8 \n\t"   \
1129    "adds r12, r12, r10 \n\t"       \
1130    "adcs r14, r14, r11 \n\t"       \
1131    "adc r9, r9, #0 \n\t"           \
1132    "umull r10, r11, r4, r7 \n\t"   \
1133    "adds r12, r12, r10 \n\t"       \
1134    "adcs r14, r14, r11 \n\t"       \
1135    "adc r9, r9, #0 \n\t"           \
1136    "ldr r10, [r0] \n\t"            \
1137    "adds r12, r12, r10 \n\t"       \
1138    "adcs r14, r14, #0 \n\t"        \
1139    "adc r9, r9, #0 \n\t"           \
1140    "stmia r0!, {r12} \n\t"         \
1141                                    \
    /* r7 = b[7]; out[9] += a2*b7 + a3*b6 + a4*b5 */ \
1142    "ldmia r2!, {r7} \n\t"          \
1143    "mov r10, #0 \n\t"              \
1144    "umull r11, r12, r5, r7 \n\t"   \
1145    "adds r14, r14, r11 \n\t"       \
1146    "adcs r9, r9, r12 \n\t"         \
1147    "adc r10, r10, #0 \n\t"         \
1148    "umull r11, r12, r3, r6 \n\t"   \
1149    "adds r14, r14, r11 \n\t"       \
1150    "adcs r9, r9, r12 \n\t"         \
1151    "adc r10, r10, #0 \n\t"         \
1152    "umull r11, r12, r4, r8 \n\t"   \
1153    "adds r14, r14, r11 \n\t"       \
1154    "adcs r9, r9, r12 \n\t"         \
1155    "adc r10, r10, #0 \n\t"         \
1156    "ldr r11, [r0] \n\t"            \
1157    "adds r14, r14, r11 \n\t"       \
1158    "adcs r9, r9, #0 \n\t"          \
1159    "adc r10, r10, #0 \n\t"         \
1160    "stmia r0!, {r14} \n\t"         \
1161                                    \
    /* out[10] = a3*b7 + a4*b6 */ \
1162    "mov r11, #0 \n\t"              \
1163    "umull r12, r14, r3, r7 \n\t"   \
1164    "adds r9, r9, r12 \n\t"         \
1165    "adcs r10, r10, r14 \n\t"       \
1166    "adc r11, r11, #0 \n\t"         \
1167    "umull r12, r14, r4, r6 \n\t"   \
1168    "adds r9, r9, r12 \n\t"         \
1169    "adcs r10, r10, r14 \n\t"       \
1170    "adc r11, r11, #0 \n\t"         \
1171    "stmia r0!, {r9} \n\t"          \
1172                                    \
    /* out[11], out[12] = a4*b7 */ \
1173    "umull r14, r9, r4, r7 \n\t"    \
1174    "adds r10, r10, r14 \n\t"       \
1175    "adc r11, r11, r9 \n\t"         \
1176    "stmia r0!, {r10, r11} \n\t"    \
1177                                    \
    /* pass 3 setup: rewind r0, r1, r2 to word 0; */ \
    /* r3,r4,r5 = a[0..2], r6,r7,r8 = b[0..2]     */ \
1178    "sub r0, 52 \n\t"               \
1179    "sub r1, 20 \n\t"               \
1180    "sub r2, 32 \n\t"               \
1181    "ldmia r1!, {r3,r4,r5} \n\t"    \
1182    "ldmia r2!, {r6,r7,r8} \n\t"    \
1183                                    \
    /* out[0] = a0*b0 */ \
1184    "umull r11, r12, r3, r6 \n\t"   \
1185    "stmia r0!, {r11} \n\t"         \
1186                                    \
    /* out[1] = a0*b1 + a1*b0 */ \
1187    "mov r10, #0 \n\t"              \
1188    "umull r11, r9, r3, r7 \n\t"    \
1189    "adds r12, r12, r11 \n\t"       \
1190    "adc r9, r9, #0 \n\t"           \
1191    "umull r11, r14, r4, r6 \n\t"   \
1192    "adds r12, r12, r11 \n\t"       \
1193    "adcs r9, r9, r14 \n\t"         \
1194    "adc r10, r10, #0 \n\t"         \
1195    "stmia r0!, {r12} \n\t"         \
1196                                    \
    /* out[2] = a0*b2 + a1*b1 + a2*b0 */ \
1197    "mov r11, #0 \n\t"              \
1198    "umull r12, r14, r3, r8 \n\t"   \
1199    "adds r9, r9, r12 \n\t"         \
1200    "adcs r10, r10, r14 \n\t"       \
1201    "adc r11, r11, #0 \n\t"         \
1202    "umull r12, r14, r4, r7 \n\t"   \
1203    "adds r9, r9, r12 \n\t"         \
1204    "adcs r10, r10, r14 \n\t"       \
1205    "adc r11, r11, #0 \n\t"         \
1206    "umull r12, r14, r5, r6 \n\t"   \
1207    "adds r9, r9, r12 \n\t"         \
1208    "adcs r10, r10, r14 \n\t"       \
1209    "adc r11, r11, #0 \n\t"         \
1210    "stmia r0!, {r9} \n\t"          \
1211                                    \
    /* r3 = a[3]; out[3] += a1*b2 + a2*b1 + a3*b0 */ \
1212    "ldmia r1!, {r3} \n\t"          \
1213    "mov r12, #0 \n\t"              \
1214    "umull r14, r9, r4, r8 \n\t"    \
1215    "adds r10, r10, r14 \n\t"       \
1216    "adcs r11, r11, r9 \n\t"        \
1217    "adc r12, r12, #0 \n\t"         \
1218    "umull r14, r9, r5, r7 \n\t"    \
1219    "adds r10, r10, r14 \n\t"       \
1220    "adcs r11, r11, r9 \n\t"        \
1221    "adc r12, r12, #0 \n\t"         \
1222    "umull r14, r9, r3, r6 \n\t"    \
1223    "adds r10, r10, r14 \n\t"       \
1224    "adcs r11, r11, r9 \n\t"        \
1225    "adc r12, r12, #0 \n\t"         \
1226    "ldr r14, [r0] \n\t"            \
1227    "adds r10, r10, r14 \n\t"       \
1228    "adcs r11, r11, #0 \n\t"        \
1229    "adc r12, r12, #0 \n\t"         \
1230    "stmia r0!, {r10} \n\t"         \
1231                                    \
    /* r4 = a[4]; out[4] += a2*b2 + a3*b1 + a4*b0 */ \
1232    "ldmia r1!, {r4} \n\t"          \
1233    "mov r14, #0 \n\t"              \
1234    "umull r9, r10, r5, r8 \n\t"    \
1235    "adds r11, r11, r9 \n\t"        \
1236    "adcs r12, r12, r10 \n\t"       \
1237    "adc r14, r14, #0 \n\t"         \
1238    "umull r9, r10, r3, r7 \n\t"    \
1239    "adds r11, r11, r9 \n\t"        \
1240    "adcs r12, r12, r10 \n\t"       \
1241    "adc r14, r14, #0 \n\t"         \
1242    "umull r9, r10, r4, r6 \n\t"    \
1243    "adds r11, r11, r9 \n\t"        \
1244    "adcs r12, r12, r10 \n\t"       \
1245    "adc r14, r14, #0 \n\t"         \
1246    "ldr r9, [r0] \n\t"             \
1247    "adds r11, r11, r9 \n\t"        \
1248    "adcs r12, r12, #0 \n\t"        \
1249    "adc r14, r14, #0 \n\t"         \
1250    "stmia r0!, {r11} \n\t"         \
1251                                    \
    /* r5 = a[5]; out[5] += a3*b2 + a4*b1 + a5*b0 */ \
1252    "ldmia r1!, {r5} \n\t"          \
1253    "mov r9, #0 \n\t"               \
1254    "umull r10, r11, r3, r8 \n\t"   \
1255    "adds r12, r12, r10 \n\t"       \
1256    "adcs r14, r14, r11 \n\t"       \
1257    "adc r9, r9, #0 \n\t"           \
1258    "umull r10, r11, r4, r7 \n\t"   \
1259    "adds r12, r12, r10 \n\t"       \
1260    "adcs r14, r14, r11 \n\t"       \
1261    "adc r9, r9, #0 \n\t"           \
1262    "umull r10, r11, r5, r6 \n\t"   \
1263    "adds r12, r12, r10 \n\t"       \
1264    "adcs r14, r14, r11 \n\t"       \
1265    "adc r9, r9, #0 \n\t"           \
1266    "ldr r10, [r0] \n\t"            \
1267    "adds r12, r12, r10 \n\t"       \
1268    "adcs r14, r14, #0 \n\t"        \
1269    "adc r9, r9, #0 \n\t"           \
1270    "stmia r0!, {r12} \n\t"         \
1271                                    \
    /* r3 = a[6]; out[6] += a4*b2 + a5*b1 + a6*b0 */ \
1272    "ldmia r1!, {r3} \n\t"          \
1273    "mov r10, #0 \n\t"              \
1274    "umull r11, r12, r4, r8 \n\t"   \
1275    "adds r14, r14, r11 \n\t"       \
1276    "adcs r9, r9, r12 \n\t"         \
1277    "adc r10, r10, #0 \n\t"         \
1278    "umull r11, r12, r5, r7 \n\t"   \
1279    "adds r14, r14, r11 \n\t"       \
1280    "adcs r9, r9, r12 \n\t"         \
1281    "adc r10, r10, #0 \n\t"         \
1282    "umull r11, r12, r3, r6 \n\t"   \
1283    "adds r14, r14, r11 \n\t"       \
1284    "adcs r9, r9, r12 \n\t"         \
1285    "adc r10, r10, #0 \n\t"         \
1286    "ldr r11, [r0] \n\t"            \
1287    "adds r14, r14, r11 \n\t"       \
1288    "adcs r9, r9, #0 \n\t"          \
1289    "adc r10, r10, #0 \n\t"         \
1290    "stmia r0!, {r14} \n\t"         \
1291                                    \
    /* r4 = a[7]; out[7] += a5*b2 + a6*b1 + a7*b0 */ \
1292    "ldmia r1!, {r4} \n\t"          \
1293    "mov r11, #0 \n\t"              \
1294    "umull r12, r14, r5, r8 \n\t"   \
1295    "adds r9, r9, r12 \n\t"         \
1296    "adcs r10, r10, r14 \n\t"       \
1297    "adc r11, r11, #0 \n\t"         \
1298    "umull r12, r14, r3, r7 \n\t"   \
1299    "adds r9, r9, r12 \n\t"         \
1300    "adcs r10, r10, r14 \n\t"       \
1301    "adc r11, r11, #0 \n\t"         \
1302    "umull r12, r14, r4, r6 \n\t"   \
1303    "adds r9, r9, r12 \n\t"         \
1304    "adcs r10, r10, r14 \n\t"       \
1305    "adc r11, r11, #0 \n\t"         \
1306    "ldr r12, [r0] \n\t"            \
1307    "adds r9, r9, r12 \n\t"         \
1308    "adcs r10, r10, #0 \n\t"        \
1309    "adc r11, r11, #0 \n\t"         \
1310    "stmia r0!, {r9} \n\t"          \
1311                                    \
    /* r6 = b[3]; out[8] += a5*b3 + a6*b2 + a7*b1 */ \
1312    "ldmia r2!, {r6} \n\t"          \
1313    "mov r12, #0 \n\t"              \
1314    "umull r14, r9, r5, r6 \n\t"    \
1315    "adds r10, r10, r14 \n\t"       \
1316    "adcs r11, r11, r9 \n\t"        \
1317    "adc r12, r12, #0 \n\t"         \
1318    "umull r14, r9, r3, r8 \n\t"    \
1319    "adds r10, r10, r14 \n\t"       \
1320    "adcs r11, r11, r9 \n\t"        \
1321    "adc r12, r12, #0 \n\t"         \
1322    "umull r14, r9, r4, r7 \n\t"    \
1323    "adds r10, r10, r14 \n\t"       \
1324    "adcs r11, r11, r9 \n\t"        \
1325    "adc r12, r12, #0 \n\t"         \
1326    "ldr r14, [r0] \n\t"            \
1327    "adds r10, r10, r14 \n\t"       \
1328    "adcs r11, r11, #0 \n\t"        \
1329    "adc r12, r12, #0 \n\t"         \
1330    "stmia r0!, {r10} \n\t"         \
1331                                    \
    /* r7 = b[4]; out[9] += a5*b4 + a6*b3 + a7*b2 */ \
1332    "ldmia r2!, {r7} \n\t"          \
1333    "mov r14, #0 \n\t"              \
1334    "umull r9, r10, r5, r7 \n\t"    \
1335    "adds r11, r11, r9 \n\t"        \
1336    "adcs r12, r12, r10 \n\t"       \
1337    "adc r14, r14, #0 \n\t"         \
1338    "umull r9, r10, r3, r6 \n\t"    \
1339    "adds r11, r11, r9 \n\t"        \
1340    "adcs r12, r12, r10 \n\t"       \
1341    "adc r14, r14, #0 \n\t"         \
1342    "umull r9, r10, r4, r8 \n\t"    \
1343    "adds r11, r11, r9 \n\t"        \
1344    "adcs r12, r12, r10 \n\t"       \
1345    "adc r14, r14, #0 \n\t"         \
1346    "ldr r9, [r0] \n\t"             \
1347    "adds r11, r11, r9 \n\t"        \
1348    "adcs r12, r12, #0 \n\t"        \
1349    "adc r14, r14, #0 \n\t"         \
1350    "stmia r0!, {r11} \n\t"         \
1351                                    \
    /* r8 = b[5]; out[10] += a5*b5 + a6*b4 + a7*b3 */ \
1352    "ldmia r2!, {r8} \n\t"          \
1353    "mov r9, #0 \n\t"               \
1354    "umull r10, r11, r5, r8 \n\t"   \
1355    "adds r12, r12, r10 \n\t"       \
1356    "adcs r14, r14, r11 \n\t"       \
1357    "adc r9, r9, #0 \n\t"           \
1358    "umull r10, r11, r3, r7 \n\t"   \
1359    "adds r12, r12, r10 \n\t"       \
1360    "adcs r14, r14, r11 \n\t"       \
1361    "adc r9, r9, #0 \n\t"           \
1362    "umull r10, r11, r4, r6 \n\t"   \
1363    "adds r12, r12, r10 \n\t"       \
1364    "adcs r14, r14, r11 \n\t"       \
1365    "adc r9, r9, #0 \n\t"           \
1366    "ldr r10, [r0] \n\t"            \
1367    "adds r12, r12, r10 \n\t"       \
1368    "adcs r14, r14, #0 \n\t"        \
1369    "adc r9, r9, #0 \n\t"           \
1370    "stmia r0!, {r12} \n\t"         \
1371                                    \
    /* r6 = b[6]; out[11] += a5*b6 + a6*b5 + a7*b4 */ \
1372    "ldmia r2!, {r6} \n\t"          \
1373    "mov r10, #0 \n\t"              \
1374    "umull r11, r12, r5, r6 \n\t"   \
1375    "adds r14, r14, r11 \n\t"       \
1376    "adcs r9, r9, r12 \n\t"         \
1377    "adc r10, r10, #0 \n\t"         \
1378    "umull r11, r12, r3, r8 \n\t"   \
1379    "adds r14, r14, r11 \n\t"       \
1380    "adcs r9, r9, r12 \n\t"         \
1381    "adc r10, r10, #0 \n\t"         \
1382    "umull r11, r12, r4, r7 \n\t"   \
1383    "adds r14, r14, r11 \n\t"       \
1384    "adcs r9, r9, r12 \n\t"         \
1385    "adc r10, r10, #0 \n\t"         \
1386    "ldr r11, [r0] \n\t"            \
1387    "adds r14, r14, r11 \n\t"       \
1388    "adcs r9, r9, #0 \n\t"          \
1389    "adc r10, r10, #0 \n\t"         \
1390    "stmia r0!, {r14} \n\t"         \
1391                                    \
    /* r7 = b[7]; out[12] += a5*b7 + a6*b6 + a7*b5 */ \
1392    "ldmia r2!, {r7} \n\t"          \
1393    "mov r11, #0 \n\t"              \
1394    "umull r12, r14, r5, r7 \n\t"   \
1395    "adds r9, r9, r12 \n\t"         \
1396    "adcs r10, r10, r14 \n\t"       \
1397    "adc r11, r11, #0 \n\t"         \
1398    "umull r12, r14, r3, r6 \n\t"   \
1399    "adds r9, r9, r12 \n\t"         \
1400    "adcs r10, r10, r14 \n\t"       \
1401    "adc r11, r11, #0 \n\t"         \
1402    "umull r12, r14, r4, r8 \n\t"   \
1403    "adds r9, r9, r12 \n\t"         \
1404    "adcs r10, r10, r14 \n\t"       \
1405    "adc r11, r11, #0 \n\t"         \
1406    "ldr r12, [r0] \n\t"            \
1407    "adds r9, r9, r12 \n\t"         \
1408    "adcs r10, r10, #0 \n\t"        \
1409    "adc r11, r11, #0 \n\t"         \
1410    "stmia r0!, {r9} \n\t"          \
1411                                    \
    /* out[13] = a6*b7 + a7*b6 */ \
1412    "mov r12, #0 \n\t"              \
1413    "umull r14, r9, r3, r7 \n\t"    \
1414    "adds r10, r10, r14 \n\t"       \
1415    "adcs r11, r11, r9 \n\t"        \
1416    "adc r12, r12, #0 \n\t"         \
1417    "umull r14, r9, r4, r6 \n\t"    \
1418    "adds r10, r10, r14 \n\t"       \
1419    "adcs r11, r11, r9 \n\t"        \
1420    "adc r12, r12, #0 \n\t"         \
1421    "stmia r0!, {r10} \n\t"         \
1422                                    \
    /* out[14], out[15] = a7*b7; then restore the r3 saved at the top */ \
1423    "umull r9, r10, r4, r7 \n\t"    \
1424    "adds r11, r11, r9 \n\t"        \
1425    "adc r12, r12, r10 \n\t"        \
1426    "stmia r0!, {r11, r12} \n\t"    \
1427    "pop {r3} \n\t"
1428
/*
 * FAST_SQUARE_ASM_5: unrolled 5-word squaring (ARM inline-asm string fragment).
 *
 * Loads five 32-bit words w0..w4 through r1 (into r2..r6) and stores the
 * ten-word square through r0, first-loaded word treated as least significant.
 * The result is built one 32-bit column at a time: column k sums every
 * w[i]*w[j] with i + j == k, counting each off-diagonal product twice, and
 * the two upper accumulator words are carried into the next column.
 *
 * r2 and the post-increment r1 are pushed on entry and restored by the
 * final pop; r0 is left pointing just past the ten stored words.
 * r3-r6, r8-r12, r14 and the condition flags are overwritten.
 */
1429#define FAST_SQUARE_ASM_5               \
1430    "push   {r2} \n\t"                  \
1431    "ldmia r1!, {r2,r3,r4,r5,r6} \n\t"  \
1432    "push   {r1} \n\t"                  \
1433    /* column 0: w0*w0 */               \
1434    "umull r11, r12, r2, r2 \n\t"       \
1435    "stmia r0!, {r11} \n\t"             \
1436    /* column 1: w0*w1 added twice */   \
1437    "mov r9, #0 \n\t"                   \
1438    "umull r10, r11, r2, r3 \n\t"       \
1439    "adds r12, r12, r10 \n\t"           \
1440    "adcs r8, r11, #0 \n\t"             \
1441    "adc r9, r9, #0 \n\t"               \
1442    "adds r12, r12, r10 \n\t"           \
1443    "adcs r8, r8, r11 \n\t"             \
1444    "adc r9, r9, #0 \n\t"               \
1445    "stmia r0!, {r12} \n\t"             \
1446    /* column 2: 2*w0*w2 + w1*w1 */     \
1447    "mov r10, #0 \n\t"                  \
1448    "umull r11, r12, r2, r4 \n\t"       \
1449    "adds r11, r11, r11 \n\t"           \
1450    "adcs r12, r12, r12 \n\t"           \
1451    "adc r10, r10, #0 \n\t"             \
1452    "adds r8, r8, r11 \n\t"             \
1453    "adcs r9, r9, r12 \n\t"             \
1454    "adc r10, r10, #0 \n\t"             \
1455    "umull r11, r12, r3, r3 \n\t"       \
1456    "adds r8, r8, r11 \n\t"             \
1457    "adcs r9, r9, r12 \n\t"             \
1458    "adc r10, r10, #0 \n\t"             \
1459    "stmia r0!, {r8} \n\t"              \
1460    /* column 3: 2*(w0*w3 + w1*w2) + carry in r9:r10; r1 is scratch from here on */ \
1461    "mov r12, #0 \n\t"                  \
1462    "umull r8, r11, r2, r5 \n\t"        \
1463    "umull r1, r14, r3, r4 \n\t"        \
1464    "adds r8, r8, r1 \n\t"              \
1465    "adcs r11, r11, r14 \n\t"           \
1466    "adc r12, r12, #0 \n\t"             \
1467    "adds r8, r8, r8 \n\t"              \
1468    "adcs r11, r11, r11 \n\t"           \
1469    "adc r12, r12, r12 \n\t"            \
1470    "adds r8, r8, r9 \n\t"              \
1471    "adcs r11, r11, r10 \n\t"           \
1472    "adc r12, r12, #0 \n\t"             \
1473    "stmia r0!, {r8} \n\t"              \
1474    /* column 4: 2*(w0*w4 + w1*w3) + w2*w2 + carry in r11:r12 */ \
1475    "mov r10, #0 \n\t"                  \
1476    "umull r8, r9, r2, r6 \n\t"         \
1477    "umull r1, r14, r3, r5 \n\t"        \
1478    "adds r8, r8, r1 \n\t"              \
1479    "adcs r9, r9, r14 \n\t"             \
1480    "adc r10, r10, #0 \n\t"             \
1481    "adds r8, r8, r8 \n\t"              \
1482    "adcs r9, r9, r9 \n\t"              \
1483    "adc r10, r10, r10 \n\t"            \
1484    "umull r1, r14, r4, r4 \n\t"        \
1485    "adds r8, r8, r1 \n\t"              \
1486    "adcs r9, r9, r14 \n\t"             \
1487    "adc r10, r10, #0 \n\t"             \
1488    "adds r8, r8, r11 \n\t"             \
1489    "adcs r9, r9, r12 \n\t"             \
1490    "adc r10, r10, #0 \n\t"             \
1491    "stmia r0!, {r8} \n\t"              \
1492    /* column 5: 2*(w1*w4 + w2*w3) + carry in r9:r10 */ \
1493    "mov r12, #0 \n\t"                  \
1494    "umull r8, r11, r3, r6 \n\t"        \
1495    "umull r1, r14, r4, r5 \n\t"        \
1496    "adds r8, r8, r1 \n\t"              \
1497    "adcs r11, r11, r14 \n\t"           \
1498    "adc r12, r12, #0 \n\t"             \
1499    "adds r8, r8, r8 \n\t"              \
1500    "adcs r11, r11, r11 \n\t"           \
1501    "adc r12, r12, r12 \n\t"            \
1502    "adds r8, r8, r9 \n\t"              \
1503    "adcs r11, r11, r10 \n\t"           \
1504    "adc r12, r12, #0 \n\t"             \
1505    "stmia r0!, {r8} \n\t"              \
1506    /* column 6: 2*w2*w4 + w3*w3, accumulated onto the carry in r11:r12 */ \
1507    "mov r8, #0 \n\t"                   \
1508    "umull r1, r10, r4, r6 \n\t"        \
1509    "adds r1, r1, r1 \n\t"              \
1510    "adcs r10, r10, r10 \n\t"           \
1511    "adc r8, r8, #0 \n\t"               \
1512    "adds r11, r11, r1 \n\t"            \
1513    "adcs r12, r12, r10 \n\t"           \
1514    "adc r8, r8, #0 \n\t"               \
1515    "umull r1, r10, r5, r5 \n\t"        \
1516    "adds r11, r11, r1 \n\t"            \
1517    "adcs r12, r12, r10 \n\t"           \
1518    "adc r8, r8, #0 \n\t"               \
1519    "stmia r0!, {r11} \n\t"             \
1520    /* column 7: 2*w3*w4, accumulated onto the carry in r12:r8 */ \
1521    "mov r11, #0 \n\t"                  \
1522    "umull r1, r10, r5, r6 \n\t"        \
1523    "adds r1, r1, r1 \n\t"              \
1524    "adcs r10, r10, r10 \n\t"           \
1525    "adc r11, r11, #0 \n\t"             \
1526    "adds r12, r12, r1 \n\t"            \
1527    "adcs r8, r8, r10 \n\t"             \
1528    "adc r11, r11, #0 \n\t"             \
1529    "stmia r0!, {r12} \n\t"             \
1530    /* columns 8-9: w4*w4 + carry in r8:r11; then restore r1 and r2 */ \
1531    "umull r1, r10, r6, r6 \n\t"        \
1532    "adds r8, r8, r1 \n\t"              \
1533    "adcs r11, r11, r10 \n\t"           \
1534    "stmia r0!, {r8, r11} \n\t"         \
1535    "pop {r1, r2} \n\t"
1536
/*
 * FAST_SQUARE_ASM_5_TO_6: extends a 5-word square to a 6-word square.
 *
 * If r2 == 5 the fragment branches forward to local label 1, which is not
 * defined in this macro (NOTE(review): expected to be supplied by the
 * enclosing asm block -- confirm at the use site).  Otherwise it steps r0
 * and r1 back by 20 bytes, loads six words w0..w5 through r1, forms
 * 2 * w5 * (w0..w4), adds the five words read through r0, adds w5*w5 into
 * the top two words and stores seven words through r0.
 *
 * NOTE(review): presumably entered with r0/r1 as left by the 5-word
 * squaring, so that r0 - 20 addresses result word 5 and r1 - 20 addresses
 * w0 -- confirm against the caller.
 * Overwrites r3-r11, r14 and the flags; r0 and r1 are left advanced.
 */
1537#define FAST_SQUARE_ASM_5_TO_6           \
1538    "cmp r2, #5 \n\t"                    \
1539    "beq 1f \n\t"                        \
1540    /* step both pointers back 5 words */ \
1541    "sub r0, #20 \n\t"                   \
1542    "sub r1, #20 \n\t"                   \
1543                                         \
1544    /* Do off-center multiplication: w5 (r11) times w0..w4, giving r3..r8. umull leaves the flags alone, so the adds/adcs chain carries across the multiplies. */   \
1545    "ldmia r1!, {r6,r7,r8,r9,r10,r11} \n\t" \
1546    "umull r3, r4, r6, r11 \n\t"         \
1547    "umull r6, r5, r7, r11 \n\t"         \
1548    "adds r4, r4, r6 \n\t"               \
1549    "umull r7, r6, r8, r11 \n\t"         \
1550    "adcs r5, r5, r7 \n\t"               \
1551    "umull r8, r7, r9, r11 \n\t"         \
1552    "adcs r6, r6, r8 \n\t"               \
1553    "umull r9, r8, r10, r11 \n\t"        \
1554    "adcs r7, r7, r9 \n\t"               \
1555    "adcs r8, r8, #0 \n\t"               \
1556                                         \
1557    /* Multiply by 2: r3..r8 doubled in place, carry-out collected in r9 */                  \
1558    "mov r9, #0 \n\t"                    \
1559    "adds r3, r3, r3 \n\t"               \
1560    "adcs r4, r4, r4 \n\t"               \
1561    "adcs r5, r5, r5 \n\t"               \
1562    "adcs r6, r6, r6 \n\t"               \
1563    "adcs r7, r7, r7 \n\t"               \
1564    "adcs r8, r8, r8 \n\t"               \
1565    "adcs r9, r9, #0 \n\t"               \
1566                                         \
1567    /* Add into previous: five words read through r0 are added to r3..r7, the carry ripples into r8 and r9, then r0 is stepped back */              \
1568    "ldr r14, [r0], #4 \n\t"             \
1569    "adds r3, r3, r14 \n\t"              \
1570    "ldr r14, [r0], #4 \n\t"             \
1571    "adcs r4, r4, r14 \n\t"              \
1572    "ldr r14, [r0], #4 \n\t"             \
1573    "adcs r5, r5, r14 \n\t"              \
1574    "ldr r14, [r0], #4 \n\t"             \
1575    "adcs r6, r6, r14 \n\t"              \
1576    "ldr r14, [r0], #4 \n\t"             \
1577    "adcs r7, r7, r14 \n\t"              \
1578    "adcs r8, r8, #0 \n\t"               \
1579    "adcs r9, r9, #0 \n\t"               \
1580    "sub r0, #20 \n\t"                   \
1581                                         \
1582    /* Perform center multiplication: r9:r8 += w5*w5, then store all seven words */  \
1583    "umlal r8, r9, r11, r11 \n\t"        \
1584    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9} \n\t"
1585
/*
 * FAST_SQUARE_ASM_6: unrolled 6-word squaring (ARM inline-asm string fragment).
 *
 * Loads six 32-bit words w0..w5 through r1 (into r2..r7) and stores the
 * twelve-word square through r0, first-loaded word treated as least
 * significant.  Column k sums every w[i]*w[j] with i + j == k, counting
 * each off-diagonal product twice; the two upper accumulator words are
 * carried into the next column.
 *
 * r2 and the post-increment r1 are pushed on entry and restored by the
 * final pop; r0 is left pointing just past the twelve stored words.
 * r3-r7, r8-r12, r14 and the condition flags are overwritten.
 */
1586#define FAST_SQUARE_ASM_6                  \
1587    "push   {r2} \n\t"                     \
1588    "ldmia r1!, {r2,r3,r4,r5,r6,r7} \n\t"  \
1589    "push   {r1} \n\t"                     \
1590    /* column 0: w0*w0 */                  \
1591    "umull r11, r12, r2, r2 \n\t"          \
1592    "stmia r0!, {r11} \n\t"                \
1593    /* column 1: w0*w1 added twice */      \
1594    "mov r9, #0 \n\t"                      \
1595    "umull r10, r11, r2, r3 \n\t"          \
1596    "adds r12, r12, r10 \n\t"              \
1597    "adcs r8, r11, #0 \n\t"                \
1598    "adc r9, r9, #0 \n\t"                  \
1599    "adds r12, r12, r10 \n\t"              \
1600    "adcs r8, r8, r11 \n\t"                \
1601    "adc r9, r9, #0 \n\t"                  \
1602    "stmia r0!, {r12} \n\t"                \
1603    /* column 2: 2*w0*w2 + w1*w1 */        \
1604    "mov r10, #0 \n\t"                     \
1605    "umull r11, r12, r2, r4 \n\t"          \
1606    "adds r11, r11, r11 \n\t"              \
1607    "adcs r12, r12, r12 \n\t"              \
1608    "adc r10, r10, #0 \n\t"                \
1609    "adds r8, r8, r11 \n\t"                \
1610    "adcs r9, r9, r12 \n\t"                \
1611    "adc r10, r10, #0 \n\t"                \
1612    "umull r11, r12, r3, r3 \n\t"          \
1613    "adds r8, r8, r11 \n\t"                \
1614    "adcs r9, r9, r12 \n\t"                \
1615    "adc r10, r10, #0 \n\t"                \
1616    "stmia r0!, {r8} \n\t"                 \
1617    /* column 3: 2*(w0*w3 + w1*w2) + carry in r9:r10; r1 is scratch from here on */ \
1618    "mov r12, #0 \n\t"                     \
1619    "umull r8, r11, r2, r5 \n\t"           \
1620    "umull r1, r14, r3, r4 \n\t"           \
1621    "adds r8, r8, r1 \n\t"                 \
1622    "adcs r11, r11, r14 \n\t"              \
1623    "adc r12, r12, #0 \n\t"                \
1624    "adds r8, r8, r8 \n\t"                 \
1625    "adcs r11, r11, r11 \n\t"              \
1626    "adc r12, r12, r12 \n\t"               \
1627    "adds r8, r8, r9 \n\t"                 \
1628    "adcs r11, r11, r10 \n\t"              \
1629    "adc r12, r12, #0 \n\t"                \
1630    "stmia r0!, {r8} \n\t"                 \
1631    /* column 4: 2*(w0*w4 + w1*w3) + w2*w2 + carry in r11:r12 */ \
1632    "mov r10, #0 \n\t"                     \
1633    "umull r8, r9, r2, r6 \n\t"            \
1634    "umull r1, r14, r3, r5 \n\t"           \
1635    "adds r8, r8, r1 \n\t"                 \
1636    "adcs r9, r9, r14 \n\t"                \
1637    "adc r10, r10, #0 \n\t"                \
1638    "adds r8, r8, r8 \n\t"                 \
1639    "adcs r9, r9, r9 \n\t"                 \
1640    "adc r10, r10, r10 \n\t"               \
1641    "umull r1, r14, r4, r4 \n\t"           \
1642    "adds r8, r8, r1 \n\t"                 \
1643    "adcs r9, r9, r14 \n\t"                \
1644    "adc r10, r10, #0 \n\t"                \
1645    "adds r8, r8, r11 \n\t"                \
1646    "adcs r9, r9, r12 \n\t"                \
1647    "adc r10, r10, #0 \n\t"                \
1648    "stmia r0!, {r8} \n\t"                 \
1649    /* column 5: 2*(w0*w5 + w1*w4 + w2*w3) + carry in r9:r10 */ \
1650    "mov r12, #0 \n\t"                     \
1651    "umull r8, r11, r2, r7 \n\t"           \
1652    "umull r1, r14, r3, r6 \n\t"           \
1653    "adds r8, r8, r1 \n\t"                 \
1654    "adcs r11, r11, r14 \n\t"              \
1655    "adc r12, r12, #0 \n\t"                \
1656    "umull r1, r14, r4, r5 \n\t"           \
1657    "adds r8, r8, r1 \n\t"                 \
1658    "adcs r11, r11, r14 \n\t"              \
1659    "adc r12, r12, #0 \n\t"                \
1660    "adds r8, r8, r8 \n\t"                 \
1661    "adcs r11, r11, r11 \n\t"              \
1662    "adc r12, r12, r12 \n\t"               \
1663    "adds r8, r8, r9 \n\t"                 \
1664    "adcs r11, r11, r10 \n\t"              \
1665    "adc r12, r12, #0 \n\t"                \
1666    "stmia r0!, {r8} \n\t"                 \
1667    /* column 6: 2*(w1*w5 + w2*w4) + w3*w3 + carry in r11:r12 */ \
1668    "mov r10, #0 \n\t"                     \
1669    "umull r8, r9, r3, r7 \n\t"            \
1670    "umull r1, r14, r4, r6 \n\t"           \
1671    "adds r8, r8, r1 \n\t"                 \
1672    "adcs r9, r9, r14 \n\t"                \
1673    "adc r10, r10, #0 \n\t"                \
1674    "adds r8, r8, r8 \n\t"                 \
1675    "adcs r9, r9, r9 \n\t"                 \
1676    "adc r10, r10, r10 \n\t"               \
1677    "umull r1, r14, r5, r5 \n\t"           \
1678    "adds r8, r8, r1 \n\t"                 \
1679    "adcs r9, r9, r14 \n\t"                \
1680    "adc r10, r10, #0 \n\t"                \
1681    "adds r8, r8, r11 \n\t"                \
1682    "adcs r9, r9, r12 \n\t"                \
1683    "adc r10, r10, #0 \n\t"                \
1684    "stmia r0!, {r8} \n\t"                 \
1685    /* column 7: 2*(w2*w5 + w3*w4) + carry in r9:r10 */ \
1686    "mov r12, #0 \n\t"                     \
1687    "umull r8, r11, r4, r7 \n\t"           \
1688    "umull r1, r14, r5, r6 \n\t"           \
1689    "adds r8, r8, r1 \n\t"                 \
1690    "adcs r11, r11, r14 \n\t"              \
1691    "adc r12, r12, #0 \n\t"                \
1692    "adds r8, r8, r8 \n\t"                 \
1693    "adcs r11, r11, r11 \n\t"              \
1694    "adc r12, r12, r12 \n\t"               \
1695    "adds r8, r8, r9 \n\t"                 \
1696    "adcs r11, r11, r10 \n\t"              \
1697    "adc r12, r12, #0 \n\t"                \
1698    "stmia r0!, {r8} \n\t"                 \
1699    /* column 8: 2*w3*w5 + w4*w4, accumulated onto the carry in r11:r12 */ \
1700    "mov r8, #0 \n\t"                      \
1701    "umull r1, r10, r5, r7 \n\t"           \
1702    "adds r1, r1, r1 \n\t"                 \
1703    "adcs r10, r10, r10 \n\t"              \
1704    "adc r8, r8, #0 \n\t"                  \
1705    "adds r11, r11, r1 \n\t"               \
1706    "adcs r12, r12, r10 \n\t"              \
1707    "adc r8, r8, #0 \n\t"                  \
1708    "umull r1, r10, r6, r6 \n\t"           \
1709    "adds r11, r11, r1 \n\t"               \
1710    "adcs r12, r12, r10 \n\t"              \
1711    "adc r8, r8, #0 \n\t"                  \
1712    "stmia r0!, {r11} \n\t"                \
1713    /* column 9: 2*w4*w5, accumulated onto the carry in r12:r8 */ \
1714    "mov r11, #0 \n\t"                     \
1715    "umull r1, r10, r6, r7 \n\t"           \
1716    "adds r1, r1, r1 \n\t"                 \
1717    "adcs r10, r10, r10 \n\t"              \
1718    "adc r11, r11, #0 \n\t"                \
1719    "adds r12, r12, r1 \n\t"               \
1720    "adcs r8, r8, r10 \n\t"                \
1721    "adc r11, r11, #0 \n\t"                \
1722    "stmia r0!, {r12} \n\t"                \
1723    /* columns 10-11: w5*w5 + carry in r8:r11; then restore r1 and r2 */ \
1724    "umull r1, r10, r7, r7 \n\t"           \
1725    "adds r8, r8, r1 \n\t"                 \
1726    "adcs r11, r11, r10 \n\t"              \
1727    "stmia r0!, {r8, r11} \n\t"            \
1728    "pop {r1, r2} \n\t"
1729
/*
 * FAST_SQUARE_ASM_6_TO_7: extends a 6-word square to a 7-word square.
 *
 * If r2 == 6 the fragment branches forward to local label 1, which is not
 * defined in this macro (NOTE(review): expected to be supplied by the
 * enclosing asm block -- confirm at the use site).  Otherwise it steps r0
 * and r1 back by 24 bytes, loads seven words w0..w6 through r1, forms
 * 2 * w6 * (w0..w5), adds the six words read through r0, adds w6*w6 into
 * the top two words and stores eight words through r0.
 *
 * NOTE(review): presumably entered with r0 just past result word 11 and r1
 * just past w5, so that the 24-byte step-back addresses result word 6 and
 * w0 -- confirm against the caller.
 * Overwrites r3-r12, r14 and the flags; r0 and r1 are left advanced.
 */
1730#define FAST_SQUARE_ASM_6_TO_7               \
1731    "cmp r2, #6 \n\t"                        \
1732    "beq 1f \n\t"                            \
1733    /* step both pointers back 6 words */    \
1734    "sub r0, #24 \n\t"                       \
1735    "sub r1, #24 \n\t"                       \
1736                                             \
1737    /* Do off-center multiplication: w6 (r12) times w0..w5, giving r3..r9. umull leaves the flags alone, so the adds/adcs chain carries across the multiplies. */       \
1738    "ldmia r1!, {r6,r7,r8,r9,r10,r11,r12} \n\t" \
1739    "umull r3, r4, r6, r12 \n\t"             \
1740    "umull r6, r5, r7, r12 \n\t"             \
1741    "adds r4, r4, r6 \n\t"                   \
1742    "umull r7, r6, r8, r12 \n\t"             \
1743    "adcs r5, r5, r7 \n\t"                   \
1744    "umull r8, r7, r9, r12 \n\t"             \
1745    "adcs r6, r6, r8 \n\t"                   \
1746    "umull r9, r8, r10, r12 \n\t"            \
1747    "adcs r7, r7, r9 \n\t"                   \
1748    "umull r10, r9, r11, r12 \n\t"           \
1749    "adcs r8, r8, r10 \n\t"                  \
1750    "adcs r9, r9, #0 \n\t"                   \
1751                                             \
1752    /* Multiply by 2: r3..r9 doubled in place, carry-out collected in r10 */                      \
1753    "mov r10, #0 \n\t"                       \
1754    "adds r3, r3, r3 \n\t"                   \
1755    "adcs r4, r4, r4 \n\t"                   \
1756    "adcs r5, r5, r5 \n\t"                   \
1757    "adcs r6, r6, r6 \n\t"                   \
1758    "adcs r7, r7, r7 \n\t"                   \
1759    "adcs r8, r8, r8 \n\t"                   \
1760    "adcs r9, r9, r9 \n\t"                   \
1761    "adcs r10, r10, #0 \n\t"                 \
1762                                             \
1763    /* Add into previous: six words read through r0 are added to r3..r8, the carry ripples into r9 and r10, then r0 is stepped back */                  \
1764    "ldr r14, [r0], #4 \n\t"                 \
1765    "adds r3, r3, r14 \n\t"                  \
1766    "ldr r14, [r0], #4 \n\t"                 \
1767    "adcs r4, r4, r14 \n\t"                  \
1768    "ldr r14, [r0], #4 \n\t"                 \
1769    "adcs r5, r5, r14 \n\t"                  \
1770    "ldr r14, [r0], #4 \n\t"                 \
1771    "adcs r6, r6, r14 \n\t"                  \
1772    "ldr r14, [r0], #4 \n\t"                 \
1773    "adcs r7, r7, r14 \n\t"                  \
1774    "ldr r14, [r0], #4 \n\t"                 \
1775    "adcs r8, r8, r14 \n\t"                  \
1776    "adcs r9, r9, #0 \n\t"                   \
1777    "adcs r10, r10, #0 \n\t"                 \
1778    "sub r0, #24 \n\t"                       \
1779                                             \
1780    /* Perform center multiplication: r10:r9 += w6*w6, then store all eight words */      \
1781    "umlal r9, r10, r12, r12 \n\t"           \
1782    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10} \n\t"
1783
/*
 * FAST_SQUARE_ASM_7: unrolled 7-word squaring (ARM inline-asm string fragment).
 *
 * Loads seven 32-bit words w0..w6 through r1 (into r2..r8) and stores the
 * fourteen-word square through r0, first-loaded word treated as least
 * significant.  Column k sums every w[i]*w[j] with i + j == k, counting
 * each off-diagonal product twice.
 *
 * Register pressure is handled in two ways:
 *   - r8 is needed as an accumulator, so w0*w6 is computed first and
 *     parked in result words 6 and 7; columns 6 and 7 read those words
 *     back through r0 before overwriting them.
 *   - once w0 is no longer needed, w6 is reloaded from memory into r2
 *     (r1 was stepped back one word after the initial load for this).
 *
 * Carry idiom used with umlal: umlal does not update the flags, so the
 * high accumulator word is copied to r14 beforehand and compared after;
 * if the old value is higher (HI) the 64-bit accumulate wrapped, and
 * adchi adds the carry that cmp just set into the third word.
 *
 * r2 and the post-increment r1 are pushed on entry and restored by the
 * final pop; r0 is left pointing just past the fourteen stored words.
 * r3-r12, r14 and the condition flags are overwritten.
 */
1784#define FAST_SQUARE_ASM_7                          \
1785    "push   {r2} \n\t"                             \
1786    "ldmia r1!, {r2, r3, r4, r5, r6, r7, r8} \n\t" \
1787    "push   {r1} \n\t"                             \
1788    "sub r1, 4 \n\t"                               \
1789    /* park w0*w6 in result words 6-7, then return r0 to word 0 */ \
1790    "add r0, 24 \n\t"                              \
1791    "umull r9, r10, r2, r8 \n\t"                   \
1792    "stmia r0!, {r9, r10} \n\t"                    \
1793    "sub r0, 32 \n\t"                              \
1794    /* column 0: w0*w0 */                          \
1795    "umull r11, r12, r2, r2 \n\t"                  \
1796    "stmia r0!, {r11} \n\t"                        \
1797    /* column 1: w0*w1 added twice (r8 is an accumulator from here on) */ \
1798    "mov r9, #0 \n\t"                              \
1799    "umull r10, r11, r2, r3 \n\t"                  \
1800    "adds r12, r12, r10 \n\t"                      \
1801    "adcs r8, r11, #0 \n\t"                        \
1802    "adc r9, r9, #0 \n\t"                          \
1803    "adds r12, r12, r10 \n\t"                      \
1804    "adcs r8, r8, r11 \n\t"                        \
1805    "adc r9, r9, #0 \n\t"                          \
1806    "stmia r0!, {r12} \n\t"                        \
1807    /* column 2: 2*w0*w2 + w1*w1 */                \
1808    "mov r10, #0 \n\t"                             \
1809    "umull r11, r12, r2, r4 \n\t"                  \
1810    "adds r11, r11, r11 \n\t"                      \
1811    "adcs r12, r12, r12 \n\t"                      \
1812    "adc r10, r10, #0 \n\t"                        \
1813    "adds r8, r8, r11 \n\t"                        \
1814    "adcs r9, r9, r12 \n\t"                        \
1815    "adc r10, r10, #0 \n\t"                        \
1816    "umull r11, r12, r3, r3 \n\t"                  \
1817    "adds r8, r8, r11 \n\t"                        \
1818    "adcs r9, r9, r12 \n\t"                        \
1819    "adc r10, r10, #0 \n\t"                        \
1820    "stmia r0!, {r8} \n\t"                         \
1821    /* column 3: 2*(w0*w3 + w1*w2) + carry in r9:r10 */ \
1822    "mov r12, #0 \n\t"                             \
1823    "umull r8, r11, r2, r5 \n\t"                   \
1824    "mov r14, r11 \n\t"                            \
1825    "umlal r8, r11, r3, r4 \n\t"                   \
1826    "cmp r14, r11 \n\t"                            \
1827    "it hi \n\t"                                   \
1828    "adchi r12, r12, #0 \n\t"                      \
1829    "adds r8, r8, r8 \n\t"                         \
1830    "adcs r11, r11, r11 \n\t"                      \
1831    "adc r12, r12, r12 \n\t"                       \
1832    "adds r8, r8, r9 \n\t"                         \
1833    "adcs r11, r11, r10 \n\t"                      \
1834    "adc r12, r12, #0 \n\t"                        \
1835    "stmia r0!, {r8} \n\t"                         \
1836    /* column 4: 2*(w0*w4 + w1*w3) + w2*w2 + carry in r11:r12 */ \
1837    "mov r10, #0 \n\t"                             \
1838    "umull r8, r9, r2, r6 \n\t"                    \
1839    "mov r14, r9 \n\t"                             \
1840    "umlal r8, r9, r3, r5 \n\t"                    \
1841    "cmp r14, r9 \n\t"                             \
1842    "it hi \n\t"                                   \
1843    "adchi r10, r10, #0 \n\t"                      \
1844    "adds r8, r8, r8 \n\t"                         \
1845    "adcs r9, r9, r9 \n\t"                         \
1846    "adc r10, r10, r10 \n\t"                       \
1847    "mov r14, r9 \n\t"                             \
1848    "umlal r8, r9, r4, r4 \n\t"                    \
1849    "cmp r14, r9 \n\t"                             \
1850    "it hi \n\t"                                   \
1851    "adchi r10, r10, #0 \n\t"                      \
1852    "adds r8, r8, r11 \n\t"                        \
1853    "adcs r9, r9, r12 \n\t"                        \
1854    "adc r10, r10, #0 \n\t"                        \
1855    "stmia r0!, {r8} \n\t"                         \
1856    /* column 5: 2*(w0*w5 + w1*w4 + w2*w3) + carry in r9:r10 */ \
1857    "mov r12, #0 \n\t"                             \
1858    "umull r8, r11, r2, r7 \n\t"                   \
1859    "mov r14, r11 \n\t"                            \
1860    "umlal r8, r11, r3, r6 \n\t"                   \
1861    "cmp r14, r11 \n\t"                            \
1862    "it hi \n\t"                                   \
1863    "adchi r12, r12, #0 \n\t"                      \
1864    "mov r14, r11 \n\t"                            \
1865    "umlal r8, r11, r4, r5 \n\t"                   \
1866    "cmp r14, r11 \n\t"                            \
1867    "it hi \n\t"                                   \
1868    "adchi r12, r12, #0 \n\t"                      \
1869    "adds r8, r8, r8 \n\t"                         \
1870    "adcs r11, r11, r11 \n\t"                      \
1871    "adc r12, r12, r12 \n\t"                       \
1872    "adds r8, r8, r9 \n\t"                         \
1873    "adcs r11, r11, r10 \n\t"                      \
1874    "adc r12, r12, #0 \n\t"                        \
1875    "stmia r0!, {r8} \n\t"                         \
1876    /* r2 = w6 from here on. column 6: 2*(w1*w5 + w2*w4 + parked low word of w0*w6) + w3*w3 + carry in r11:r12 */ \
1877    "ldmia r1!, {r2} \n\t"                         \
1878    "mov r10, #0 \n\t"                             \
1879    "umull r8, r9, r3, r7 \n\t"                    \
1880    "mov r14, r9 \n\t"                             \
1881    "umlal r8, r9, r4, r6 \n\t"                    \
1882    "cmp r14, r9 \n\t"                             \
1883    "it hi \n\t"                                   \
1884    "adchi r10, r10, #0 \n\t"                      \
1885    "ldr r14, [r0] \n\t"                           \
1886    "adds r8, r8, r14 \n\t"                        \
1887    "adcs r9, r9, #0 \n\t"                         \
1888    "adc r10, r10, #0 \n\t"                        \
1889    "adds r8, r8, r8 \n\t"                         \
1890    "adcs r9, r9, r9 \n\t"                         \
1891    "adc r10, r10, r10 \n\t"                       \
1892    "mov r14, r9 \n\t"                             \
1893    "umlal r8, r9, r5, r5 \n\t"                    \
1894    "cmp r14, r9 \n\t"                             \
1895    "it hi \n\t"                                   \
1896    "adchi r10, r10, #0 \n\t"                      \
1897    "adds r8, r8, r11 \n\t"                        \
1898    "adcs r9, r9, r12 \n\t"                        \
1899    "adc r10, r10, #0 \n\t"                        \
1900    "stmia r0!, {r8} \n\t"                         \
1901    /* column 7: 2*(w1*w6 + w2*w5 + w3*w4 + parked high word of w0*w6) + carry in r9:r10 */ \
1902    "mov r12, #0 \n\t"                             \
1903    "umull r8, r11, r3, r2 \n\t"                   \
1904    "mov r14, r11 \n\t"                            \
1905    "umlal r8, r11, r4, r7 \n\t"                   \
1906    "cmp r14, r11 \n\t"                            \
1907    "it hi \n\t"                                   \
1908    "adchi r12, r12, #0 \n\t"                      \
1909    "mov r14, r11 \n\t"                            \
1910    "umlal r8, r11, r5, r6 \n\t"                   \
1911    "cmp r14, r11 \n\t"                            \
1912    "it hi \n\t"                                   \
1913    "adchi r12, r12, #0 \n\t"                      \
1914    "ldr r14, [r0] \n\t"                           \
1915    "adds r8, r8, r14 \n\t"                        \
1916    "adcs r11, r11, #0 \n\t"                       \
1917    "adc r12, r12, #0 \n\t"                        \
1918    "adds r8, r8, r8 \n\t"                         \
1919    "adcs r11, r11, r11 \n\t"                      \
1920    "adc r12, r12, r12 \n\t"                       \
1921    "adds r8, r8, r9 \n\t"                         \
1922    "adcs r11, r11, r10 \n\t"                      \
1923    "adc r12, r12, #0 \n\t"                        \
1924    "stmia r0!, {r8} \n\t"                         \
1925    /* column 8: 2*(w2*w6 + w3*w5) + w4*w4 + carry in r11:r12 */ \
1926    "mov r10, #0 \n\t"                             \
1927    "umull r8, r9, r4, r2 \n\t"                    \
1928    "mov r14, r9 \n\t"                             \
1929    "umlal r8, r9, r5, r7 \n\t"                    \
1930    "cmp r14, r9 \n\t"                             \
1931    "it hi \n\t"                                   \
1932    "adchi r10, r10, #0 \n\t"                      \
1933    "adds r8, r8, r8 \n\t"                         \
1934    "adcs r9, r9, r9 \n\t"                         \
1935    "adc r10, r10, r10 \n\t"                       \
1936    "mov r14, r9 \n\t"                             \
1937    "umlal r8, r9, r6, r6 \n\t"                    \
1938    "cmp r14, r9 \n\t"                             \
1939    "it hi \n\t"                                   \
1940    "adchi r10, r10, #0 \n\t"                      \
1941    "adds r8, r8, r11 \n\t"                        \
1942    "adcs r9, r9, r12 \n\t"                        \
1943    "adc r10, r10, #0 \n\t"                        \
1944    "stmia r0!, {r8} \n\t"                         \
1945    /* column 9: 2*(w3*w6 + w4*w5) + carry in r9:r10 */ \
1946    "mov r12, #0 \n\t"                             \
1947    "umull r8, r11, r5, r2 \n\t"                   \
1948    "mov r14, r11 \n\t"                            \
1949    "umlal r8, r11, r6, r7 \n\t"                   \
1950    "cmp r14, r11 \n\t"                            \
1951    "it hi \n\t"                                   \
1952    "adchi r12, r12, #0 \n\t"                      \
1953    "adds r8, r8, r8 \n\t"                         \
1954    "adcs r11, r11, r11 \n\t"                      \
1955    "adc r12, r12, r12 \n\t"                       \
1956    "adds r8, r8, r9 \n\t"                         \
1957    "adcs r11, r11, r10 \n\t"                      \
1958    "adc r12, r12, #0 \n\t"                        \
1959    "stmia r0!, {r8} \n\t"                         \
1960    /* column 10: 2*w4*w6 + w5*w5, accumulated onto the carry in r11:r12; r1 is scratch from here on */ \
1961    "mov r8, #0 \n\t"                              \
1962    "umull r1, r10, r6, r2 \n\t"                   \
1963    "adds r1, r1, r1 \n\t"                         \
1964    "adcs r10, r10, r10 \n\t"                      \
1965    "adc r8, r8, #0 \n\t"                          \
1966    "adds r11, r11, r1 \n\t"                       \
1967    "adcs r12, r12, r10 \n\t"                      \
1968    "adc r8, r8, #0 \n\t"                          \
1969    "umull r1, r10, r7, r7 \n\t"                   \
1970    "adds r11, r11, r1 \n\t"                       \
1971    "adcs r12, r12, r10 \n\t"                      \
1972    "adc r8, r8, #0 \n\t"                          \
1973    "stmia r0!, {r11} \n\t"                        \
1974    /* column 11: 2*w5*w6, accumulated onto the carry in r12:r8 */ \
1975    "mov r11, #0 \n\t"                             \
1976    "umull r1, r10, r7, r2 \n\t"                   \
1977    "adds r1, r1, r1 \n\t"                         \
1978    "adcs r10, r10, r10 \n\t"                      \
1979    "adc r11, r11, #0 \n\t"                        \
1980    "adds r12, r12, r1 \n\t"                       \
1981    "adcs r8, r8, r10 \n\t"                        \
1982    "adc r11, r11, #0 \n\t"                        \
1983    "stmia r0!, {r12} \n\t"                        \
1984    /* columns 12-13: w6*w6 + carry in r8:r11; then restore r1 and r2 */ \
1985    "umull r1, r10, r2, r2 \n\t"                   \
1986    "adds r8, r8, r1 \n\t"                         \
1987    "adcs r11, r11, r10 \n\t"                      \
1988    "stmia r0!, {r8, r11} \n\t"                    \
1989    "pop {r1, r2} \n\t"
1990
/*
 * FAST_SQUARE_ASM_7_TO_8: extends a 7-word square to an 8-word square.
 *
 * If r2 == 7 the fragment branches forward to local label 1, which is not
 * defined in this macro (NOTE(review): expected to be supplied by the
 * enclosing asm block -- confirm at the use site).  Otherwise it steps r0
 * and r1 back by 28 bytes, loads eight words w0..w7 through r1, forms
 * 2 * w7 * (w0..w6), adds the seven words read through r0, adds w7*w7 into
 * the top two words and stores nine words through r0.
 *
 * r14 holds w7 throughout, so the "add into previous" loads use r12 as the
 * temporary (w6 in r12 has already been consumed by then).
 *
 * NOTE(review): presumably entered with r0 just past result word 13 and r1
 * just past w6, so that the 28-byte step-back addresses result word 7 and
 * w0 -- confirm against the caller.
 * Overwrites r3-r12, r14 and the flags; r0 and r1 are left advanced.
 */
1991#define FAST_SQUARE_ASM_7_TO_8           \
1992    "cmp r2, #7 \n\t"                    \
1993    "beq 1f \n\t"                        \
1994    /* step both pointers back 7 words */ \
1995    "sub r0, #28 \n\t"                   \
1996    "sub r1, #28 \n\t"                   \
1997                                         \
1998    /* Do off-center multiplication: w7 (r14) times w0..w6, giving r3..r10. umull leaves the flags alone, so the adds/adcs chain carries across the multiplies. */   \
1999    "ldmia r1!, {r6,r7,r8,r9,r10,r11,r12,r14} \n\t" \
2000    "umull r3, r4, r6, r14 \n\t"         \
2001    "umull r6, r5, r7, r14 \n\t"         \
2002    "adds r4, r4, r6 \n\t"               \
2003    "umull r7, r6, r8, r14 \n\t"         \
2004    "adcs r5, r5, r7 \n\t"               \
2005    "umull r8, r7, r9, r14 \n\t"         \
2006    "adcs r6, r6, r8 \n\t"               \
2007    "umull r9, r8, r10, r14 \n\t"        \
2008    "adcs r7, r7, r9 \n\t"               \
2009    "umull r10, r9, r11, r14 \n\t"       \
2010    "adcs r8, r8, r10 \n\t"              \
2011    "umull r11, r10, r12, r14 \n\t"      \
2012    "adcs r9, r9, r11 \n\t"              \
2013    "adcs r10, r10, #0 \n\t"             \
2014                                         \
2015    /* Multiply by 2: r3..r10 doubled in place, carry-out collected in r11 */                  \
2016    "mov r11, #0 \n\t"                   \
2017    "adds r3, r3, r3 \n\t"               \
2018    "adcs r4, r4, r4 \n\t"               \
2019    "adcs r5, r5, r5 \n\t"               \
2020    "adcs r6, r6, r6 \n\t"               \
2021    "adcs r7, r7, r7 \n\t"               \
2022    "adcs r8, r8, r8 \n\t"               \
2023    "adcs r9, r9, r9 \n\t"               \
2024    "adcs r10, r10, r10 \n\t"            \
2025    "adcs r11, r11, #0 \n\t"             \
2026                                         \
2027    /* Add into previous: seven words read through r0 are added to r3..r9, the carry ripples into r10 and r11, then r0 is stepped back */              \
2028    "ldr r12, [r0], #4 \n\t"             \
2029    "adds r3, r3, r12 \n\t"              \
2030    "ldr r12, [r0], #4 \n\t"             \
2031    "adcs r4, r4, r12 \n\t"              \
2032    "ldr r12, [r0], #4 \n\t"             \
2033    "adcs r5, r5, r12 \n\t"              \
2034    "ldr r12, [r0], #4 \n\t"             \
2035    "adcs r6, r6, r12 \n\t"              \
2036    "ldr r12, [r0], #4 \n\t"             \
2037    "adcs r7, r7, r12 \n\t"              \
2038    "ldr r12, [r0], #4 \n\t"             \
2039    "adcs r8, r8, r12 \n\t"              \
2040    "ldr r12, [r0], #4 \n\t"             \
2041    "adcs r9, r9, r12 \n\t"              \
2042    "adcs r10, r10, #0 \n\t"             \
2043    "adcs r11, r11, #0 \n\t"             \
2044    "sub r0, #28 \n\t"                   \
2045                                         \
2046    /* Perform center multiplication: r11:r10 += w7*w7, then store all nine words */  \
2047    "umlal r10, r11, r14, r14 \n\t"      \
2048    "stmia r0!, {r3,r4,r5,r6,r7,r8,r9,r10,r11} \n\t"
2049
/*
 * FAST_SQUARE_ASM_8 - inline-assembly text (adjacent string literals, one
 * ARM instruction each) that squares an 8-word operand into a 16-word result.
 *
 * Notation used in the comments below: a0..a7 are the eight 32-bit words
 * loaded from [r1] (a0 at the lowest address, treated as least significant);
 * p0..p15 are the sixteen 32-bit words stored through r0.
 *
 * Register contract, as established by the instructions themselves:
 *   r0  - destination pointer. Sixteen words are written; on exit r0 is
 *         64 bytes past its entry value.
 *   r1  - source pointer. On exit it is 32 bytes past its entry value (the
 *         copy pushed right after the initial ldmia is what the final pop
 *         restores; r1 itself is used as scratch near the end).
 *   r2  - pushed on entry and restored by the final pop.
 *   r3-r12, r14 and the condition flags are overwritten and not restored.
 *   NOTE(review): the enclosing asm statement presumably declares these as
 *   clobbers and binds r0/r1 to the operands - confirm at the use site.
 *
 * Method: result column k is built as
 *     2 * (sum of ai*aj with i < j and i + j == k)  [+ a(k/2)^2 if k is even]
 *     + the carry left over from column k-1,
 * and its lowest word is stored as pk. A column is accumulated in three
 * registers (low, high, overflow). From column 3 onwards the accumulator
 * alternates between (r8, r11, r12) and (r8, r9, r10), so the upper two
 * words of the previous column are still available as the incoming carry.
 *
 * The products a0*a6, a0*a7 and a1*a7 are computed first and stored in
 * p6..p9, because r8/r9 are immediately reused as accumulators and r2/r3 are
 * later reloaded with a6/a7. Columns 6..9 read those stored words back with
 * "ldr r14, [r0]" and add them in before doubling.
 *
 * Wrap detection after umlal: the high word is copied to r14 beforehand; if
 * the accumulate leaves the high word lower than that copy ("cmp" gives HI,
 * which also means carry set), "adc ..., #0" adds one to the overflow word.
 * The "it hi" line makes that adc conditional.
 */
2050#define FAST_SQUARE_ASM_8                   \
2051    "push   {r2} \n\t"  /* save r2 */       \
2052    "ldmia r1!, {r2,r3,r4,r5,r6,r7,r8,r9} \n\t" /* r2..r9 = a0..a7 */ \
2053    "push   {r1} \n\t"  /* r1 = src + 32 */ \
2054    "sub r1, 8 \n\t"    /* r1 -> a6 */      \
2055    /* Store a0*a6 + (a0*a7 << 32) + (a1*a7 << 64) in p6..p9 */ \
2056    "add r0, 24 \n\t"                       \
2057    "umull r10, r11, r2, r8 \n\t"           \
2058    "umull r12, r14, r2, r9 \n\t"           \
2059    "umull r8, r9, r3, r9 \n\t" /* r8,r9 stop holding a6,a7 */ \
2060    "adds r11, r11, r12 \n\t"               \
2061    "adcs r12, r14, r8 \n\t"                \
2062    "adcs r14, r9, #0 \n\t"                 \
2063    "stmia r0!, {r10, r11, r12, r14} \n\t"  \
2064    "sub r0, 40 \n\t"   /* r0 -> p0 */      \
2065    /* Column 0: a0^2; low word is p0 */    \
2066    "umull r11, r12, r2, r2 \n\t"           \
2067    "stmia r0!, {r11} \n\t"                 \
2068    /* Column 1: a0*a1 added twice */       \
2069    "mov r9, #0 \n\t"                       \
2070    "umull r10, r11, r2, r3 \n\t"           \
2071    "adds r12, r12, r10 \n\t"               \
2072    "adcs r8, r11, #0 \n\t"                 \
2073    "adc r9, r9, #0 \n\t"                   \
2074    "adds r12, r12, r10 \n\t"               \
2075    "adcs r8, r8, r11 \n\t"                 \
2076    "adc r9, r9, #0 \n\t"                   \
2077    "stmia r0!, {r12} \n\t"                 \
2078    /* Column 2: 2*a0*a2 + a1^2 */          \
2079    "mov r10, #0 \n\t"                      \
2080    "umull r11, r12, r2, r4 \n\t"           \
2081    "adds r11, r11, r11 \n\t"               \
2082    "adcs r12, r12, r12 \n\t"               \
2083    "adc r10, r10, #0 \n\t"                 \
2084    "adds r8, r8, r11 \n\t"                 \
2085    "adcs r9, r9, r12 \n\t"                 \
2086    "adc r10, r10, #0 \n\t"                 \
2087    "umull r11, r12, r3, r3 \n\t"           \
2088    "adds r8, r8, r11 \n\t"                 \
2089    "adcs r9, r9, r12 \n\t"                 \
2090    "adc r10, r10, #0 \n\t"                 \
2091    "stmia r0!, {r8} \n\t"                  \
2092    /* Column 3: 2*(a0*a3 + a1*a2) */       \
2093    "mov r12, #0 \n\t"                      \
2094    "umull r8, r11, r2, r5 \n\t"            \
2095    "mov r14, r11 \n\t" /* keep old high word */ \
2096    "umlal r8, r11, r3, r4 \n\t"            \
2097    "cmp r14, r11 \n\t" /* high word shrank => wrapped */ \
2098    "it hi \n\t"                            \
2099    "adchi r12, r12, #0 \n\t" /* carry is set here: +1 */ \
2100    "adds r8, r8, r8 \n\t"                  \
2101    "adcs r11, r11, r11 \n\t"               \
2102    "adc r12, r12, r12 \n\t"                \
2103    "adds r8, r8, r9 \n\t"                  \
2104    "adcs r11, r11, r10 \n\t"               \
2105    "adc r12, r12, #0 \n\t"                 \
2106    "stmia r0!, {r8} \n\t"                  \
2107    /* Column 4: 2*(a0*a4 + a1*a3) + a2^2 */ \
2108    "mov r10, #0 \n\t"                      \
2109    "umull r8, r9, r2, r6 \n\t"             \
2110    "mov r14, r9 \n\t"                      \
2111    "umlal r8, r9, r3, r5 \n\t"             \
2112    "cmp r14, r9 \n\t"                      \
2113    "it hi \n\t"                            \
2114    "adchi r10, r10, #0 \n\t"               \
2115    "adds r8, r8, r8 \n\t"                  \
2116    "adcs r9, r9, r9 \n\t"                  \
2117    "adc r10, r10, r10 \n\t"                \
2118    "mov r14, r9 \n\t"                      \
2119    "umlal r8, r9, r4, r4 \n\t"             \
2120    "cmp r14, r9 \n\t"                      \
2121    "it hi \n\t"                            \
2122    "adchi r10, r10, #0 \n\t"               \
2123    "adds r8, r8, r11 \n\t"                 \
2124    "adcs r9, r9, r12 \n\t"                 \
2125    "adc r10, r10, #0 \n\t"                 \
2126    "stmia r0!, {r8} \n\t"                  \
2127    /* Column 5: 2*(a0*a5 + a1*a4 + a2*a3) */ \
2128    "mov r12, #0 \n\t"                      \
2129    "umull r8, r11, r2, r7 \n\t"            \
2130    "mov r14, r11 \n\t"                     \
2131    "umlal r8, r11, r3, r6 \n\t"            \
2132    "cmp r14, r11 \n\t"                     \
2133    "it hi \n\t"                            \
2134    "adchi r12, r12, #0 \n\t"               \
2135    "mov r14, r11 \n\t"                     \
2136    "umlal r8, r11, r4, r5 \n\t"            \
2137    "cmp r14, r11 \n\t"                     \
2138    "it hi \n\t"                            \
2139    "adchi r12, r12, #0 \n\t"               \
2140    "adds r8, r8, r8 \n\t"                  \
2141    "adcs r11, r11, r11 \n\t"               \
2142    "adc r12, r12, r12 \n\t"                \
2143    "adds r8, r8, r9 \n\t"                  \
2144    "adcs r11, r11, r10 \n\t"               \
2145    "adc r12, r12, #0 \n\t"                 \
2146    "stmia r0!, {r8} \n\t"                  \
2147    /* Column 6: 2*(a1*a5 + a2*a4 + stored p6) + a3^2 */ \
2148    "ldmia r1!, {r2} \n\t" /* r2 = a6 */    \
2149    "mov r10, #0 \n\t"                      \
2150    "umull r8, r9, r3, r7 \n\t"             \
2151    "mov r14, r9 \n\t"                      \
2152    "umlal r8, r9, r4, r6 \n\t"             \
2153    "cmp r14, r9 \n\t"                      \
2154    "it hi \n\t"                            \
2155    "adchi r10, r10, #0 \n\t"               \
2156    "ldr r14, [r0] \n\t" /* word stored earlier */ \
2157    "adds r8, r8, r14 \n\t"                 \
2158    "adcs r9, r9, #0 \n\t"                  \
2159    "adc r10, r10, #0 \n\t"                 \
2160    "adds r8, r8, r8 \n\t"                  \
2161    "adcs r9, r9, r9 \n\t"                  \
2162    "adc r10, r10, r10 \n\t"                \
2163    "mov r14, r9 \n\t"                      \
2164    "umlal r8, r9, r5, r5 \n\t"             \
2165    "cmp r14, r9 \n\t"                      \
2166    "it hi \n\t"                            \
2167    "adchi r10, r10, #0 \n\t"               \
2168    "adds r8, r8, r11 \n\t"                 \
2169    "adcs r9, r9, r12 \n\t"                 \
2170    "adc r10, r10, #0 \n\t"                 \
2171    "stmia r0!, {r8} \n\t"                  \
2172    /* Column 7: 2*(a1*a6 + a2*a5 + a3*a4 + stored p7) */ \
2173    "mov r12, #0 \n\t"                      \
2174    "umull r8, r11, r3, r2 \n\t"            \
2175    "mov r14, r11 \n\t"                     \
2176    "umlal r8, r11, r4, r7 \n\t"            \
2177    "cmp r14, r11 \n\t"                     \
2178    "it hi \n\t"                            \
2179    "adchi r12, r12, #0 \n\t"               \
2180    "mov r14, r11 \n\t"                     \
2181    "umlal r8, r11, r5, r6 \n\t"            \
2182    "cmp r14, r11 \n\t"                     \
2183    "it hi \n\t"                            \
2184    "adchi r12, r12, #0 \n\t"               \
2185    "ldr r14, [r0] \n\t"                    \
2186    "adds r8, r8, r14 \n\t"                 \
2187    "adcs r11, r11, #0 \n\t"                \
2188    "adc r12, r12, #0 \n\t"                 \
2189    "adds r8, r8, r8 \n\t"                  \
2190    "adcs r11, r11, r11 \n\t"               \
2191    "adc r12, r12, r12 \n\t"                \
2192    "adds r8, r8, r9 \n\t"                  \
2193    "adcs r11, r11, r10 \n\t"               \
2194    "adc r12, r12, #0 \n\t"                 \
2195    "stmia r0!, {r8} \n\t"                  \
2196    /* Column 8: 2*(a2*a6 + a3*a5 + stored p8) + a4^2 */ \
2197    "ldmia r1!, {r3} \n\t" /* r3 = a7 */    \
2198    "mov r10, #0 \n\t"                      \
2199    "umull r8, r9, r4, r2 \n\t"             \
2200    "mov r14, r9 \n\t"                      \
2201    "umlal r8, r9, r5, r7 \n\t"             \
2202    "cmp r14, r9 \n\t"                      \
2203    "it hi \n\t"                            \
2204    "adchi r10, r10, #0 \n\t"               \
2205    "ldr r14, [r0] \n\t"                    \
2206    "adds r8, r8, r14 \n\t"                 \
2207    "adcs r9, r9, #0 \n\t"                  \
2208    "adc r10, r10, #0 \n\t"                 \
2209    "adds r8, r8, r8 \n\t"                  \
2210    "adcs r9, r9, r9 \n\t"                  \
2211    "adc r10, r10, r10 \n\t"                \
2212    "mov r14, r9 \n\t"                      \
2213    "umlal r8, r9, r6, r6 \n\t"             \
2214    "cmp r14, r9 \n\t"                      \
2215    "it hi \n\t"                            \
2216    "adchi r10, r10, #0 \n\t"               \
2217    "adds r8, r8, r11 \n\t"                 \
2218    "adcs r9, r9, r12 \n\t"                 \
2219    "adc r10, r10, #0 \n\t"                 \
2220    "stmia r0!, {r8} \n\t"                  \
2221    /* Column 9: 2*(a2*a7 + a3*a6 + a4*a5 + stored p9) */ \
2222    "mov r12, #0 \n\t"                      \
2223    "umull r8, r11, r4, r3 \n\t"            \
2224    "mov r14, r11 \n\t"                     \
2225    "umlal r8, r11, r5, r2 \n\t"            \
2226    "cmp r14, r11 \n\t"                     \
2227    "it hi \n\t"                            \
2228    "adchi r12, r12, #0 \n\t"               \
2229    "mov r14, r11 \n\t"                     \
2230    "umlal r8, r11, r6, r7 \n\t"            \
2231    "cmp r14, r11 \n\t"                     \
2232    "it hi \n\t"                            \
2233    "adchi r12, r12, #0 \n\t"               \
2234    "ldr r14, [r0] \n\t"                    \
2235    "adds r8, r8, r14 \n\t"                 \
2236    "adcs r11, r11, #0 \n\t"                \
2237    "adc r12, r12, #0 \n\t"                 \
2238    "adds r8, r8, r8 \n\t"                  \
2239    "adcs r11, r11, r11 \n\t"               \
2240    "adc r12, r12, r12 \n\t"                \
2241    "adds r8, r8, r9 \n\t"                  \
2242    "adcs r11, r11, r10 \n\t"               \
2243    "adc r12, r12, #0 \n\t"                 \
2244    "stmia r0!, {r8} \n\t"                  \
2245    /* Column 10: 2*(a3*a7 + a4*a6) + a5^2 */ \
2246    "mov r10, #0 \n\t"                      \
2247    "umull r8, r9, r5, r3 \n\t"             \
2248    "mov r14, r9 \n\t"                      \
2249    "umlal r8, r9, r6, r2 \n\t"             \
2250    "cmp r14, r9 \n\t"                      \
2251    "it hi \n\t"                            \
2252    "adchi r10, r10, #0 \n\t"               \
2253    "adds r8, r8, r8 \n\t"                  \
2254    "adcs r9, r9, r9 \n\t"                  \
2255    "adc r10, r10, r10 \n\t"                \
2256    "mov r14, r9 \n\t"                      \
2257    "umlal r8, r9, r7, r7 \n\t"             \
2258    "cmp r14, r9 \n\t"                      \
2259    "it hi \n\t"                            \
2260    "adchi r10, r10, #0 \n\t"               \
2261    "adds r8, r8, r11 \n\t"                 \
2262    "adcs r9, r9, r12 \n\t"                 \
2263    "adc r10, r10, #0 \n\t"                 \
2264    "stmia r0!, {r8} \n\t"                  \
2265    /* Column 11: 2*(a4*a7 + a5*a6) */      \
2266    "mov r12, #0 \n\t"                      \
2267    "umull r8, r11, r6, r3 \n\t"            \
2268    "mov r14, r11 \n\t"                     \
2269    "umlal r8, r11, r7, r2 \n\t"            \
2270    "cmp r14, r11 \n\t"                     \
2271    "it hi \n\t"                            \
2272    "adchi r12, r12, #0 \n\t"               \
2273    "adds r8, r8, r8 \n\t"                  \
2274    "adcs r11, r11, r11 \n\t"               \
2275    "adc r12, r12, r12 \n\t"                \
2276    "adds r8, r8, r9 \n\t"                  \
2277    "adcs r11, r11, r10 \n\t"               \
2278    "adc r12, r12, #0 \n\t"                 \
2279    "stmia r0!, {r8} \n\t"                  \
2280    /* Column 12: 2*a5*a7 + a6^2 */         \
2281    "mov r8, #0 \n\t"                       \
2282    "umull r1, r10, r7, r3 \n\t" /* r1 is scratch from here */ \
2283    "adds r1, r1, r1 \n\t"                  \
2284    "adcs r10, r10, r10 \n\t"               \
2285    "adc r8, r8, #0 \n\t"                   \
2286    "adds r11, r11, r1 \n\t"                \
2287    "adcs r12, r12, r10 \n\t"               \
2288    "adc r8, r8, #0 \n\t"                   \
2289    "umull r1, r10, r2, r2 \n\t"            \
2290    "adds r11, r11, r1 \n\t"                \
2291    "adcs r12, r12, r10 \n\t"               \
2292    "adc r8, r8, #0 \n\t"                   \
2293    "stmia r0!, {r11} \n\t"                 \
2294    /* Column 13: 2*a6*a7 */                \
2295    "mov r11, #0 \n\t"                      \
2296    "umull r1, r10, r2, r3 \n\t"            \
2297    "adds r1, r1, r1 \n\t"                  \
2298    "adcs r10, r10, r10 \n\t"               \
2299    "adc r11, r11, #0 \n\t"                 \
2300    "adds r12, r12, r1 \n\t"                \
2301    "adcs r8, r8, r10 \n\t"                 \
2302    "adc r11, r11, #0 \n\t"                 \
2303    "stmia r0!, {r12} \n\t"                 \
2304    /* Columns 14-15: a7^2 plus carry */    \
2305    "umull r1, r10, r3, r3 \n\t"            \
2306    "adds r8, r8, r1 \n\t"                  \
2307    "adcs r11, r11, r10 \n\t"               \
2308    "stmia r0!, {r8, r11} \n\t"             \
2309    "pop {r1, r2} \n\t" /* r1 = src + 32; r2 = entry value */
2310
2311#endif /* _UECC_ASM_ARM_MULT_SQUARE_H_ */
2312