// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(_WIN32)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl gcm_init_neon

.def gcm_init_neon
    .type 32
.endef
.align 4
gcm_init_neon:
    AARCH64_VALID_CALL_TARGET
    // This function is adapted from gcm_init_v8. xC2 is t3.
    ld1 {v17.2d}, [x1] // load H
    movi v19.16b, #0xe1
    shl v19.2d, v19.2d, #57 // 0xc2.0
    ext v3.16b, v17.16b, v17.16b, #8
    ushr v18.2d, v19.2d, #63
    dup v17.4s, v17.s[1]
    ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
    ushr v18.2d, v3.2d, #63
    sshr v17.4s, v17.4s, #31 // broadcast carry bit
    and v18.16b, v18.16b, v16.16b
    shl v3.2d, v3.2d, #1
    ext v18.16b, v18.16b, v18.16b, #8
    and v16.16b, v16.16b, v17.16b
    orr v3.16b, v3.16b, v18.16b // H<<<=1
    eor v5.16b, v3.16b, v16.16b // twisted H
    st1 {v5.2d}, [x0] // store Htable[0]
    ret

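// The two functions below consume Htable[0] in the twisted form stored
// above: its 64-bit halves are loaded into v5/v6 and their xor (v7) is
// precomputed for the Karatsuba multiplication.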
.globl gcm_gmult_neon

.def gcm_gmult_neon
    .type 32
.endef
.align 4
gcm_gmult_neon:
    AARCH64_VALID_CALL_TARGET
    ld1 {v3.16b}, [x0] // load Xi
    ld1 {v5.1d}, [x1], #8 // load twisted H
    ld1 {v6.1d}, [x1]
    adrp x9, Lmasks // load constants
    add x9, x9, :lo12:Lmasks
    ld1 {v24.2d, v25.2d}, [x9]
    rev64 v3.16b, v3.16b // byteswap Xi
    ext v3.16b, v3.16b, v3.16b, #8
    eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

    mov x3, #16
    b Lgmult_neon


.globl gcm_ghash_neon

.def gcm_ghash_neon
    .type 32
.endef
.align 4
gcm_ghash_neon:
    AARCH64_VALID_CALL_TARGET
    ld1 {v0.16b}, [x0] // load Xi
    ld1 {v5.1d}, [x1], #8 // load twisted H
    ld1 {v6.1d}, [x1]
    adrp x9, Lmasks // load constants
    add x9, x9, :lo12:Lmasks
    ld1 {v24.2d, v25.2d}, [x9]
    rev64 v0.16b, v0.16b // byteswap Xi
    ext v0.16b, v0.16b, v0.16b, #8
    eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing

Loop_neon:
    ld1 {v3.16b}, [x2], #16 // load inp
    rev64 v3.16b, v3.16b // byteswap inp
    ext v3.16b, v3.16b, v3.16b, #8
    eor v3.16b, v3.16b, v0.16b // inp ^= Xi

Lgmult_neon:
    // Split the input into v3 and v4. (The upper halves are unused,
    // so it is okay to leave them alone.)
    ins v4.d[0], v3.d[1]
    ext v16.8b, v5.8b, v5.8b, #1 // A1
    pmull v16.8h, v16.8b, v3.8b // F = A1*B
    ext v0.8b, v3.8b, v3.8b, #1 // B1
    pmull v0.8h, v5.8b, v0.8b // E = A*B1
    ext v17.8b, v5.8b, v5.8b, #2 // A2
    pmull v17.8h, v17.8b, v3.8b // H = A2*B
    ext v19.8b, v3.8b, v3.8b, #2 // B2
    pmull v19.8h, v5.8b, v19.8b // G = A*B2
    ext v18.8b, v5.8b, v5.8b, #3 // A3
    eor v16.16b, v16.16b, v0.16b // L = E + F
    pmull v18.8h, v18.8b, v3.8b // J = A3*B
    ext v0.8b, v3.8b, v3.8b, #3 // B3
    eor v17.16b, v17.16b, v19.16b // M = G + H
    pmull v0.8h, v5.8b, v0.8b // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.

    ext v19.8b, v3.8b, v3.8b, #4 // B4
    eor v18.16b, v18.16b, v0.16b // N = I + J
    pmull v19.8h, v5.8b, v19.8b // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
    pmull v0.8h, v5.8b, v3.8b // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v0.16b, v0.16b, v16.16b
    eor v0.16b, v0.16b, v18.16b
    eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
    ext v16.8b, v7.8b, v7.8b, #1 // A1
    pmull v16.8h, v16.8b, v3.8b // F = A1*B
    ext v1.8b, v3.8b, v3.8b, #1 // B1
    pmull v1.8h, v7.8b, v1.8b // E = A*B1
    ext v17.8b, v7.8b, v7.8b, #2 // A2
    pmull v17.8h, v17.8b, v3.8b // H = A2*B
    ext v19.8b, v3.8b, v3.8b, #2 // B2
    pmull v19.8h, v7.8b, v19.8b // G = A*B2
    ext v18.8b, v7.8b, v7.8b, #3 // A3
    eor v16.16b, v16.16b, v1.16b // L = E + F
    pmull v18.8h, v18.8b, v3.8b // J = A3*B
    ext v1.8b, v3.8b, v3.8b, #3 // B3
    eor v17.16b, v17.16b, v19.16b // M = G + H
    pmull v1.8h, v7.8b, v1.8b // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.
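    //
    // Once the masks have cleared the bits that would otherwise wrap
    // around in the rotates, the 128-bit product is assembled below as
    //
    //     D ^ (L << 8) ^ (M << 16) ^ (N << 24) ^ (K << 32)
    //
    // matching the t0/t1/t2/t3 shifts annotated on the ext instructions
    // that follow.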

    ext v19.8b, v3.8b, v3.8b, #4 // B4
    eor v18.16b, v18.16b, v1.16b // N = I + J
    pmull v19.8h, v7.8b, v19.8b // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
    pmull v1.8h, v7.8b, v3.8b // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v1.16b, v1.16b, v16.16b
    eor v1.16b, v1.16b, v18.16b
    ext v16.8b, v6.8b, v6.8b, #1 // A1
    pmull v16.8h, v16.8b, v4.8b // F = A1*B
    ext v2.8b, v4.8b, v4.8b, #1 // B1
    pmull v2.8h, v6.8b, v2.8b // E = A*B1
    ext v17.8b, v6.8b, v6.8b, #2 // A2
    pmull v17.8h, v17.8b, v4.8b // H = A2*B
    ext v19.8b, v4.8b, v4.8b, #2 // B2
    pmull v19.8h, v6.8b, v19.8b // G = A*B2
    ext v18.8b, v6.8b, v6.8b, #3 // A3
    eor v16.16b, v16.16b, v2.16b // L = E + F
    pmull v18.8h, v18.8b, v4.8b // J = A3*B
    ext v2.8b, v4.8b, v4.8b, #3 // B3
    eor v17.16b, v17.16b, v19.16b // M = G + H
    pmull v2.8h, v6.8b, v2.8b // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.

    ext v19.8b, v4.8b, v4.8b, #4 // B4
    eor v18.16b, v18.16b, v2.16b // N = I + J
    pmull v19.8h, v6.8b, v19.8b // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
    pmull v2.8h, v6.8b, v4.8b // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v2.16b, v2.16b, v16.16b
    eor v2.16b, v2.16b, v18.16b
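    // The three Karatsuba partial products are now complete: v0 (v5 with
    // the original v3), v1 (v7 with v3^v4) and v2 (v6 with v4). The eors
    // below recover the middle term and assemble the 256-bit result
    // Xh|Xl before the reduction.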
    ext v16.16b, v0.16b, v2.16b, #8
    eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
    eor v1.16b, v1.16b, v2.16b
    eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
    ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
    // This is a no-op due to the ins instruction below.
    // ins v2.d[0], v1.d[1]

    // equivalent of reduction_avx from ghash-x86_64.pl
    shl v17.2d, v0.2d, #57 // 1st phase
    shl v18.2d, v0.2d, #62
    eor v18.16b, v18.16b, v17.16b //
    shl v17.2d, v0.2d, #63
    eor v18.16b, v18.16b, v17.16b //
    // Note Xm contains {Xl.d[1], Xh.d[0]}.
    eor v18.16b, v18.16b, v1.16b
    ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
    ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]

    ushr v18.2d, v0.2d, #1 // 2nd phase
    eor v2.16b, v2.16b, v0.16b
    eor v0.16b, v0.16b, v18.16b //
    ushr v18.2d, v18.2d, #6
    ushr v0.2d, v0.2d, #1 //
    eor v0.16b, v0.16b, v2.16b //
    eor v0.16b, v0.16b, v18.16b //

    subs x3, x3, #16
    bne Loop_neon

    rev64 v0.16b, v0.16b // byteswap Xi and write
    ext v0.16b, v0.16b, v0.16b, #8
    st1 {v0.16b}, [x0]

    ret


.section .rodata
.align 4
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(_WIN32)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif