// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl  gcm_init_neon
.hidden gcm_init_neon
.type   gcm_init_neon,%function
.align  4
gcm_init_neon:
        // This function is adapted from gcm_init_v8. xC2 is t3.
        ld1     {v17.2d}, [x1]                  // load H
        movi    v19.16b, #0xe1
        shl     v19.2d, v19.2d, #57             // 0xc2.0
        ext     v3.16b, v17.16b, v17.16b, #8
        ushr    v18.2d, v19.2d, #63
        dup     v17.4s, v17.s[1]
        ext     v16.16b, v18.16b, v19.16b, #8   // t0=0xc2....01
        ushr    v18.2d, v3.2d, #63
        sshr    v17.4s, v17.4s, #31             // broadcast carry bit
        and     v18.16b, v18.16b, v16.16b
        shl     v3.2d, v3.2d, #1
        ext     v18.16b, v18.16b, v18.16b, #8
        and     v16.16b, v16.16b, v17.16b
        orr     v3.16b, v3.16b, v18.16b         // H<<<=1
        eor     v5.16b, v3.16b, v16.16b         // twisted H
        st1     {v5.2d}, [x0]                   // store Htable[0]
        ret
.size   gcm_init_neon,.-gcm_init_neon

.globl  gcm_gmult_neon
.hidden gcm_gmult_neon
.type   gcm_gmult_neon,%function
.align  4
gcm_gmult_neon:
        ld1     {v3.16b}, [x0]          // load Xi
        ld1     {v5.1d}, [x1], #8       // load twisted H
        ld1     {v6.1d}, [x1]
        adrp    x9, .Lmasks             // load constants
        add     x9, x9, :lo12:.Lmasks
        ld1     {v24.2d, v25.2d}, [x9]
        rev64   v3.16b, v3.16b          // byteswap Xi
        ext     v3.16b, v3.16b, v3.16b, #8
        eor     v7.8b, v5.8b, v6.8b     // Karatsuba pre-processing

        mov     x3, #16
        b       .Lgmult_neon
.size   gcm_gmult_neon,.-gcm_gmult_neon

.globl  gcm_ghash_neon
.hidden gcm_ghash_neon
.type   gcm_ghash_neon,%function
.align  4
gcm_ghash_neon:
        ld1     {v0.16b}, [x0]          // load Xi
        ld1     {v5.1d}, [x1], #8       // load twisted H
        ld1     {v6.1d}, [x1]
        adrp    x9, .Lmasks             // load constants
        add     x9, x9, :lo12:.Lmasks
        ld1     {v24.2d, v25.2d}, [x9]
        rev64   v0.16b, v0.16b          // byteswap Xi
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v7.8b, v5.8b, v6.8b     // Karatsuba pre-processing

.Loop_neon:
        ld1     {v3.16b}, [x2], #16     // load inp
        rev64   v3.16b, v3.16b          // byteswap inp
        ext     v3.16b, v3.16b, v3.16b, #8
        eor     v3.16b, v3.16b, v0.16b  // inp ^= Xi

.Lgmult_neon:
        // Split the input into v3 and v4. (The upper halves are unused,
        // so it is okay to leave them alone.)
        ins     v4.d[0], v3.d[1]
        ext     v16.8b, v5.8b, v5.8b, #1        // A1
        pmull   v16.8h, v16.8b, v3.8b           // F = A1*B
        ext     v0.8b, v3.8b, v3.8b, #1         // B1
        pmull   v0.8h, v5.8b, v0.8b             // E = A*B1
        ext     v17.8b, v5.8b, v5.8b, #2        // A2
        pmull   v17.8h, v17.8b, v3.8b           // H = A2*B
        ext     v19.8b, v3.8b, v3.8b, #2        // B2
        pmull   v19.8h, v5.8b, v19.8b           // G = A*B2
        ext     v18.8b, v5.8b, v5.8b, #3        // A3
        eor     v16.16b, v16.16b, v0.16b        // L = E + F
        pmull   v18.8h, v18.8b, v3.8b           // J = A3*B
        ext     v0.8b, v3.8b, v3.8b, #3         // B3
        eor     v17.16b, v17.16b, v19.16b       // M = G + H
        pmull   v0.8h, v5.8b, v0.8b             // I = A*B3
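        // The pmull instructions above multiply 8-bit polynomial lanes, so a
        // 64x64-bit carry-less product is assembled from byte-sized partial
        // products: each ext #1..#4 forms a byte-rotated copy of an operand,
        // and the ext #15/#14/#13/#12 steps further down slide the partial
        // products back into position before everything is XORed together.
        // This appears to mirror the vmull.p8 construction in the 32-bit
        // ARMv4 code this file is derived from, expressed with AArch64 pmull.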
        // Here we diverge from the 32-bit version. It computes the following
        // (instructions reordered for clarity):
        //
        // veor    $t0#lo, $t0#lo, $t0#hi  @ t0 = P0 + P1 (L)
        // vand    $t0#hi, $t0#hi, $k48
        // veor    $t0#lo, $t0#lo, $t0#hi
        //
        // veor    $t1#lo, $t1#lo, $t1#hi  @ t1 = P2 + P3 (M)
        // vand    $t1#hi, $t1#hi, $k32
        // veor    $t1#lo, $t1#lo, $t1#hi
        //
        // veor    $t2#lo, $t2#lo, $t2#hi  @ t2 = P4 + P5 (N)
        // vand    $t2#hi, $t2#hi, $k16
        // veor    $t2#lo, $t2#lo, $t2#hi
        //
        // veor    $t3#lo, $t3#lo, $t3#hi  @ t3 = P6 + P7 (K)
        // vmov.i64        $t3#hi, #0
        //
        // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
        // upper halves of SIMD registers, so we must split each half into
        // separate registers. To compensate, we pair computations up and
        // parallelize.

        ext     v19.8b, v3.8b, v3.8b, #4        // B4
        eor     v18.16b, v18.16b, v0.16b        // N = I + J
        pmull   v19.8h, v5.8b, v19.8b           // K = A*B4

        // This can probably be scheduled more efficiently. For now, we just
        // pair up independent instructions.
        zip1    v20.2d, v16.2d, v17.2d
        zip1    v22.2d, v18.2d, v19.2d
        zip2    v21.2d, v16.2d, v17.2d
        zip2    v23.2d, v18.2d, v19.2d
        eor     v20.16b, v20.16b, v21.16b
        eor     v22.16b, v22.16b, v23.16b
        and     v21.16b, v21.16b, v24.16b
        and     v23.16b, v23.16b, v25.16b
        eor     v20.16b, v20.16b, v21.16b
        eor     v22.16b, v22.16b, v23.16b
        zip1    v16.2d, v20.2d, v21.2d
        zip1    v18.2d, v22.2d, v23.2d
        zip2    v17.2d, v20.2d, v21.2d
        zip2    v19.2d, v22.2d, v23.2d

        ext     v16.16b, v16.16b, v16.16b, #15  // t0 = t0 << 8
        ext     v17.16b, v17.16b, v17.16b, #14  // t1 = t1 << 16
        pmull   v0.8h, v5.8b, v3.8b             // D = A*B
        ext     v19.16b, v19.16b, v19.16b, #12  // t3 = t3 << 32
        ext     v18.16b, v18.16b, v18.16b, #13  // t2 = t2 << 24
        eor     v16.16b, v16.16b, v17.16b
        eor     v18.16b, v18.16b, v19.16b
        eor     v0.16b, v0.16b, v16.16b
        eor     v0.16b, v0.16b, v18.16b
        eor     v3.8b, v3.8b, v4.8b             // Karatsuba pre-processing
        ext     v16.8b, v7.8b, v7.8b, #1        // A1
        pmull   v16.8h, v16.8b, v3.8b           // F = A1*B
        ext     v1.8b, v3.8b, v3.8b, #1         // B1
        pmull   v1.8h, v7.8b, v1.8b             // E = A*B1
        ext     v17.8b, v7.8b, v7.8b, #2        // A2
        pmull   v17.8h, v17.8b, v3.8b           // H = A2*B
        ext     v19.8b, v3.8b, v3.8b, #2        // B2
        pmull   v19.8h, v7.8b, v19.8b           // G = A*B2
        ext     v18.8b, v7.8b, v7.8b, #3        // A3
        eor     v16.16b, v16.16b, v1.16b        // L = E + F
        pmull   v18.8h, v18.8b, v3.8b           // J = A3*B
        ext     v1.8b, v3.8b, v3.8b, #3         // B3
        eor     v17.16b, v17.16b, v19.16b       // M = G + H
        pmull   v1.8h, v7.8b, v1.8b             // I = A*B3

        // Here we diverge from the 32-bit version. It computes the following
        // (instructions reordered for clarity):
        //
        // veor    $t0#lo, $t0#lo, $t0#hi  @ t0 = P0 + P1 (L)
        // vand    $t0#hi, $t0#hi, $k48
        // veor    $t0#lo, $t0#lo, $t0#hi
        //
        // veor    $t1#lo, $t1#lo, $t1#hi  @ t1 = P2 + P3 (M)
        // vand    $t1#hi, $t1#hi, $k32
        // veor    $t1#lo, $t1#lo, $t1#hi
        //
        // veor    $t2#lo, $t2#lo, $t2#hi  @ t2 = P4 + P5 (N)
        // vand    $t2#hi, $t2#hi, $k16
        // veor    $t2#lo, $t2#lo, $t2#hi
        //
        // veor    $t3#lo, $t3#lo, $t3#hi  @ t3 = P6 + P7 (K)
        // vmov.i64        $t3#hi, #0
        //
        // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
        // upper halves of SIMD registers, so we must split each half into
        // separate registers. To compensate, we pair computations up and
        // parallelize.
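        // The masks $k48, $k32 and $k16 referred to above are stored in
        // .Lmasks at the bottom of this file; v24 was loaded with {k48, k32}
        // and v25 with {k16, k0}, so each of the paired and instructions
        // below applies two of the masks at once.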
        ext     v19.8b, v3.8b, v3.8b, #4        // B4
        eor     v18.16b, v18.16b, v1.16b        // N = I + J
        pmull   v19.8h, v7.8b, v19.8b           // K = A*B4

        // This can probably be scheduled more efficiently. For now, we just
        // pair up independent instructions.
        zip1    v20.2d, v16.2d, v17.2d
        zip1    v22.2d, v18.2d, v19.2d
        zip2    v21.2d, v16.2d, v17.2d
        zip2    v23.2d, v18.2d, v19.2d
        eor     v20.16b, v20.16b, v21.16b
        eor     v22.16b, v22.16b, v23.16b
        and     v21.16b, v21.16b, v24.16b
        and     v23.16b, v23.16b, v25.16b
        eor     v20.16b, v20.16b, v21.16b
        eor     v22.16b, v22.16b, v23.16b
        zip1    v16.2d, v20.2d, v21.2d
        zip1    v18.2d, v22.2d, v23.2d
        zip2    v17.2d, v20.2d, v21.2d
        zip2    v19.2d, v22.2d, v23.2d

        ext     v16.16b, v16.16b, v16.16b, #15  // t0 = t0 << 8
        ext     v17.16b, v17.16b, v17.16b, #14  // t1 = t1 << 16
        pmull   v1.8h, v7.8b, v3.8b             // D = A*B
        ext     v19.16b, v19.16b, v19.16b, #12  // t3 = t3 << 32
        ext     v18.16b, v18.16b, v18.16b, #13  // t2 = t2 << 24
        eor     v16.16b, v16.16b, v17.16b
        eor     v18.16b, v18.16b, v19.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v1.16b, v1.16b, v18.16b
        ext     v16.8b, v6.8b, v6.8b, #1        // A1
        pmull   v16.8h, v16.8b, v4.8b           // F = A1*B
        ext     v2.8b, v4.8b, v4.8b, #1         // B1
        pmull   v2.8h, v6.8b, v2.8b             // E = A*B1
        ext     v17.8b, v6.8b, v6.8b, #2        // A2
        pmull   v17.8h, v17.8b, v4.8b           // H = A2*B
        ext     v19.8b, v4.8b, v4.8b, #2        // B2
        pmull   v19.8h, v6.8b, v19.8b           // G = A*B2
        ext     v18.8b, v6.8b, v6.8b, #3        // A3
        eor     v16.16b, v16.16b, v2.16b        // L = E + F
        pmull   v18.8h, v18.8b, v4.8b           // J = A3*B
        ext     v2.8b, v4.8b, v4.8b, #3         // B3
        eor     v17.16b, v17.16b, v19.16b       // M = G + H
        pmull   v2.8h, v6.8b, v2.8b             // I = A*B3

        // Here we diverge from the 32-bit version. It computes the following
        // (instructions reordered for clarity):
        //
        // veor    $t0#lo, $t0#lo, $t0#hi  @ t0 = P0 + P1 (L)
        // vand    $t0#hi, $t0#hi, $k48
        // veor    $t0#lo, $t0#lo, $t0#hi
        //
        // veor    $t1#lo, $t1#lo, $t1#hi  @ t1 = P2 + P3 (M)
        // vand    $t1#hi, $t1#hi, $k32
        // veor    $t1#lo, $t1#lo, $t1#hi
        //
        // veor    $t2#lo, $t2#lo, $t2#hi  @ t2 = P4 + P5 (N)
        // vand    $t2#hi, $t2#hi, $k16
        // veor    $t2#lo, $t2#lo, $t2#hi
        //
        // veor    $t3#lo, $t3#lo, $t3#hi  @ t3 = P6 + P7 (K)
        // vmov.i64        $t3#hi, #0
        //
        // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
        // upper halves of SIMD registers, so we must split each half into
        // separate registers. To compensate, we pair computations up and
        // parallelize.

        ext     v19.8b, v4.8b, v4.8b, #4        // B4
        eor     v18.16b, v18.16b, v2.16b        // N = I + J
        pmull   v19.8h, v6.8b, v19.8b           // K = A*B4

        // This can probably be scheduled more efficiently. For now, we just
        // pair up independent instructions.
        zip1    v20.2d, v16.2d, v17.2d
        zip1    v22.2d, v18.2d, v19.2d
        zip2    v21.2d, v16.2d, v17.2d
        zip2    v23.2d, v18.2d, v19.2d
        eor     v20.16b, v20.16b, v21.16b
        eor     v22.16b, v22.16b, v23.16b
        and     v21.16b, v21.16b, v24.16b
        and     v23.16b, v23.16b, v25.16b
        eor     v20.16b, v20.16b, v21.16b
        eor     v22.16b, v22.16b, v23.16b
        zip1    v16.2d, v20.2d, v21.2d
        zip1    v18.2d, v22.2d, v23.2d
        zip2    v17.2d, v20.2d, v21.2d
        zip2    v19.2d, v22.2d, v23.2d

        ext     v16.16b, v16.16b, v16.16b, #15  // t0 = t0 << 8
        ext     v17.16b, v17.16b, v17.16b, #14  // t1 = t1 << 16
        pmull   v2.8h, v6.8b, v4.8b             // D = A*B
        ext     v19.16b, v19.16b, v19.16b, #12  // t3 = t3 << 32
        ext     v18.16b, v18.16b, v18.16b, #13  // t2 = t2 << 24
        eor     v16.16b, v16.16b, v17.16b
        eor     v18.16b, v18.16b, v19.16b
        eor     v2.16b, v2.16b, v16.16b
        eor     v2.16b, v2.16b, v18.16b
        ext     v16.16b, v0.16b, v2.16b, #8
        eor     v1.16b, v1.16b, v0.16b          // Karatsuba post-processing
        eor     v1.16b, v1.16b, v2.16b
        eor     v1.16b, v1.16b, v16.16b         // Xm overlaps Xh.lo and Xl.hi
        ins     v0.d[1], v1.d[0]                // Xh|Xl - 256-bit result
        // This is a no-op due to the ins instruction below.
        // ins  v2.d[0], v1.d[1]

        // equivalent of reduction_avx from ghash-x86_64.pl
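        // The fold below reduces the 256-bit product Xh|Xl modulo the GHASH
        // polynomial x^128 + x^7 + x^2 + x + 1. The left-shift counts 57, 62
        // and 63 in the first phase are 64 minus 7, 2 and 1, i.e. they encode
        // the polynomial's low terms x^7 + x^2 + x; the second phase applies
        // the matching right shifts. This is believed to follow the
        // reduction_avx routine mentioned above.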
        shl     v17.2d, v0.2d, #57              // 1st phase
        shl     v18.2d, v0.2d, #62
        eor     v18.16b, v18.16b, v17.16b       //
        shl     v17.2d, v0.2d, #63
        eor     v18.16b, v18.16b, v17.16b       //
        // Note Xm contains {Xl.d[1], Xh.d[0]}.
        eor     v18.16b, v18.16b, v1.16b
        ins     v0.d[1], v18.d[0]               // Xl.d[1] ^= t2.d[0]
        ins     v2.d[0], v18.d[1]               // Xh.d[0] ^= t2.d[1]

        ushr    v18.2d, v0.2d, #1               // 2nd phase
        eor     v2.16b, v2.16b, v0.16b
        eor     v0.16b, v0.16b, v18.16b         //
        ushr    v18.2d, v18.2d, #6
        ushr    v0.2d, v0.2d, #1                //
        eor     v0.16b, v0.16b, v2.16b          //
        eor     v0.16b, v0.16b, v18.16b         //

        subs    x3, x3, #16
        bne     .Loop_neon

        rev64   v0.16b, v0.16b                  // byteswap Xi and write
        ext     v0.16b, v0.16b, v0.16b, #8
        st1     {v0.16b}, [x0]

        ret
.size   gcm_ghash_neon,.-gcm_ghash_neon

.section        .rodata
.align  4
.Lmasks:
.quad   0x0000ffffffffffff      // k48
.quad   0x00000000ffffffff      // k32
.quad   0x000000000000ffff      // k16
.quad   0x0000000000000000      // k0
.byte   71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2
.align  2
#endif
#endif  // !OPENSSL_NO_ASM
.section        .note.GNU-stack,"",%progbits