1#include "arm_arch.h" 2 3#if defined(__thumb2__) 4.syntax unified 5.thumb 6#else 7.code 32 8#endif 9 10.text 11.type mul_1x1_ialu,%function 12.align 5 13mul_1x1_ialu: 14 mov r4,#0 15 bic r5,r1,#3<<30 @ a1=a&0x3fffffff 16 str r4,[sp,#0] @ tab[0]=0 17 add r6,r5,r5 @ a2=a1<<1 18 str r5,[sp,#4] @ tab[1]=a1 19 eor r7,r5,r6 @ a1^a2 20 str r6,[sp,#8] @ tab[2]=a2 21 mov r8,r5,lsl#2 @ a4=a1<<2 22 str r7,[sp,#12] @ tab[3]=a1^a2 23 eor r9,r5,r8 @ a1^a4 24 str r8,[sp,#16] @ tab[4]=a4 25 eor r4,r6,r8 @ a2^a4 26 str r9,[sp,#20] @ tab[5]=a1^a4 27 eor r7,r7,r8 @ a1^a2^a4 28 str r4,[sp,#24] @ tab[6]=a2^a4 29 and r8,r12,r0,lsl#2 30 str r7,[sp,#28] @ tab[7]=a1^a2^a4 31 32 and r9,r12,r0,lsr#1 33 ldr r5,[sp,r8] @ tab[b & 0x7] 34 and r8,r12,r0,lsr#4 35 ldr r7,[sp,r9] @ tab[b >> 3 & 0x7] 36 and r9,r12,r0,lsr#7 37 ldr r6,[sp,r8] @ tab[b >> 6 & 0x7] 38 eor r5,r5,r7,lsl#3 @ stall 39 mov r4,r7,lsr#29 40 ldr r7,[sp,r9] @ tab[b >> 9 & 0x7] 41 42 and r8,r12,r0,lsr#10 43 eor r5,r5,r6,lsl#6 44 eor r4,r4,r6,lsr#26 45 ldr r6,[sp,r8] @ tab[b >> 12 & 0x7] 46 47 and r9,r12,r0,lsr#13 48 eor r5,r5,r7,lsl#9 49 eor r4,r4,r7,lsr#23 50 ldr r7,[sp,r9] @ tab[b >> 15 & 0x7] 51 52 and r8,r12,r0,lsr#16 53 eor r5,r5,r6,lsl#12 54 eor r4,r4,r6,lsr#20 55 ldr r6,[sp,r8] @ tab[b >> 18 & 0x7] 56 57 and r9,r12,r0,lsr#19 58 eor r5,r5,r7,lsl#15 59 eor r4,r4,r7,lsr#17 60 ldr r7,[sp,r9] @ tab[b >> 21 & 0x7] 61 62 and r8,r12,r0,lsr#22 63 eor r5,r5,r6,lsl#18 64 eor r4,r4,r6,lsr#14 65 ldr r6,[sp,r8] @ tab[b >> 24 & 0x7] 66 67 and r9,r12,r0,lsr#25 68 eor r5,r5,r7,lsl#21 69 eor r4,r4,r7,lsr#11 70 ldr r7,[sp,r9] @ tab[b >> 27 & 0x7] 71 72 tst r1,#1<<30 73 and r8,r12,r0,lsr#28 74 eor r5,r5,r6,lsl#24 75 eor r4,r4,r6,lsr#8 76 ldr r6,[sp,r8] @ tab[b >> 30 ] 77 78#ifdef __thumb2__ 79 itt ne 80#endif 81 eorne r5,r5,r0,lsl#30 82 eorne r4,r4,r0,lsr#2 83 tst r1,#1<<31 84 eor r5,r5,r7,lsl#27 85 eor r4,r4,r7,lsr#5 86#ifdef __thumb2__ 87 itt ne 88#endif 89 eorne r5,r5,r0,lsl#31 90 eorne r4,r4,r0,lsr#1 91 eor r5,r5,r6,lsl#30 92 eor r4,r4,r6,lsr#2 93 94 mov pc,lr 95.size mul_1x1_ialu,.-mul_1x1_ialu 96.globl bn_GF2m_mul_2x2 97.type bn_GF2m_mul_2x2,%function 98.align 5 99bn_GF2m_mul_2x2: 100#if __ARM_MAX_ARCH__>=7 101 stmdb sp!,{r10,lr} 102 ldr r12,.LOPENSSL_armcap 103# if !defined(_WIN32) 104 adr r10,.LOPENSSL_armcap 105 ldr r12,[r12,r10] 106# endif 107# if defined(__APPLE__) || defined(_WIN32) 108 ldr r12,[r12] 109# endif 110 tst r12,#ARMV7_NEON 111 itt ne 112 ldrne r10,[sp],#8 113 bne .LNEON 114 stmdb sp!,{r4,r5,r6,r7,r8,r9} 115#else 116 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} 117#endif 118 mov r10,r0 @ reassign 1st argument 119 mov r0,r3 @ r0=b1 120 sub r7,sp,#36 121 mov r8,sp 122 and r7,r7,#-32 123 ldr r3,[sp,#32] @ load b0 124 mov r12,#7<<2 125 mov sp,r7 @ allocate tab[8] 126 str r8,[r7,#32] 127 128 bl mul_1x1_ialu @ a1·b1 129 str r5,[r10,#8] 130 str r4,[r10,#12] 131 132 eor r0,r0,r3 @ flip b0 and b1 133 eor r1,r1,r2 @ flip a0 and a1 134 eor r3,r3,r0 135 eor r2,r2,r1 136 eor r0,r0,r3 137 eor r1,r1,r2 138 bl mul_1x1_ialu @ a0·b0 139 str r5,[r10] 140 str r4,[r10,#4] 141 142 eor r1,r1,r2 143 eor r0,r0,r3 144 bl mul_1x1_ialu @ (a1+a0)·(b1+b0) 145 ldmia r10,{r6,r7,r8,r9} 146 eor r5,r5,r4 147 ldr sp,[sp,#32] @ destroy tab[8] 148 eor r4,r4,r7 149 eor r5,r5,r6 150 eor r4,r4,r8 151 eor r5,r5,r9 152 eor r4,r4,r9 153 str r4,[r10,#8] 154 eor r5,r5,r4 155 str r5,[r10,#4] 156 157#if __ARM_ARCH__>=5 158 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} 159#else 160 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,lr} 161 tst lr,#1 162 moveq pc,lr @ be binary compatible with V4, yet 163.word 0xe12fff1e @ interoperable with Thumb ISA:-) 164#endif 165#if __ARM_MAX_ARCH__>=7 166.arch armv7-a 167.fpu neon 168 169.align 5 170.LNEON: 171 ldr r12, [sp] @ 5th argument 172 vmov d26, r2, r1 173 vmov d27, r12, r3 174 vmov.i64 d28, #0x0000ffffffffffff 175 vmov.i64 d29, #0x00000000ffffffff 176 vmov.i64 d30, #0x000000000000ffff 177 178 vext.8 d2, d26, d26, #1 @ A1 179 vmull.p8 q1, d2, d27 @ F = A1*B 180 vext.8 d0, d27, d27, #1 @ B1 181 vmull.p8 q0, d26, d0 @ E = A*B1 182 vext.8 d4, d26, d26, #2 @ A2 183 vmull.p8 q2, d4, d27 @ H = A2*B 184 vext.8 d16, d27, d27, #2 @ B2 185 vmull.p8 q8, d26, d16 @ G = A*B2 186 vext.8 d6, d26, d26, #3 @ A3 187 veor q1, q1, q0 @ L = E + F 188 vmull.p8 q3, d6, d27 @ J = A3*B 189 vext.8 d0, d27, d27, #3 @ B3 190 veor q2, q2, q8 @ M = G + H 191 vmull.p8 q0, d26, d0 @ I = A*B3 192 veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8 193 vand d3, d3, d28 194 vext.8 d16, d27, d27, #4 @ B4 195 veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16 196 vand d5, d5, d29 197 vmull.p8 q8, d26, d16 @ K = A*B4 198 veor q3, q3, q0 @ N = I + J 199 veor d2, d2, d3 200 veor d4, d4, d5 201 veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24 202 vand d7, d7, d30 203 vext.8 q1, q1, q1, #15 204 veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32 205 vmov.i64 d17, #0 206 vext.8 q2, q2, q2, #14 207 veor d6, d6, d7 208 vmull.p8 q0, d26, d27 @ D = A*B 209 vext.8 q8, q8, q8, #12 210 vext.8 q3, q3, q3, #13 211 veor q1, q1, q2 212 veor q3, q3, q8 213 veor q0, q0, q1 214 veor q0, q0, q3 215 216 vst1.32 {q0}, [r0] 217 bx lr @ bx lr 218#endif 219.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 220#if __ARM_MAX_ARCH__>=7 221.align 5 222.LOPENSSL_armcap: 223# ifdef _WIN32 224.word OPENSSL_armcap_P 225# else 226.word OPENSSL_armcap_P-. 227# endif 228#endif 229.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 230.align 2 231.align 5 232 233#if __ARM_MAX_ARCH__>=7 234.comm OPENSSL_armcap_P,4,4 235#endif 236