// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon

.align	4
_gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b		// H<<<=1
	eor	v5.16b, v3.16b, v16.16b		// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon

.align	4
_gcm_gmult_neon:
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon

.align	4
_gcm_ghash_neon:
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
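	//
	// In this implementation t0..t3 live in v16..v19, and the masks
	// {k48, k32} and {k16, k0} are preloaded into v24 and v25 from Lmasks.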

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b		// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
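	// The zip1/zip2 pairs transpose {t0,t1} and {t2,t3} so that both low
	// 64-bit halves share one register and both high halves another; the
	// eor/and/eor sequence then applies the 32-bit code's fold (using the
	// masks in v24/v25) to two values at a time, and the second zip1/zip2
	// pair restores the original layout.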
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
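	// Same masked-fold pattern as above, now applied to the partial
	// products of the third multiplication (v6 by v4), which produces
	// Xh in v2.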
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b		// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b		// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b		//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b		//
	eor	v0.16b, v0.16b, v18.16b		//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b			// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM