// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b		// H<<<=1
	eor	v5.16b, v3.16b, v16.16b		// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

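	// Each pmull above operates on .8b lanes: it computes eight
	// independent 8-bit x 8-bit -> 16-bit carryless (polynomial)
	// multiplications. The byte rotations A1..A3/B1..B3 and the masked
	// folds below assemble those small products into the full
	// 64x64-bit product, following the ARMv4 NEON algorithm this file
	// is derived from. As a reference model, a single 8-bit lane
	// behaves like the following C (an illustrative sketch; pmull8 is
	// not a symbol in this file):
	//
	//	uint16_t pmull8(uint8_t a, uint8_t b) {
	//		uint16_t r = 0;
	//		for (int i = 0; i < 8; i++)
	//			if ((b >> i) & 1)
	//				r ^= (uint16_t)a << i;	// GF(2) add is XOR
	//		return r;
	//	}
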
	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
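	//
	// Per 64-bit lane, each eor/and/eor triple below performs the same
	// "lo ^= hi; hi &= kN; lo ^= hi" fold as the 32-bit code quoted
	// above, two registers at a time: after the zips, v21 = {t0.hi,
	// t1.hi} is masked by v24 = {k48, k32} and v23 = {t2.hi, t3.hi} by
	// v25 = {k16, k0} (k0 zeroes t3.hi, standing in for the vmov.i64
	// above). The net effect per register is (a sketch, not from this
	// file):
	//
	//	t.lo = t.lo ^ t.hi ^ (t.hi & kN);	// xor in the bits kN masks off
	//	t.hi = t.hi & kN;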
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b		// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]	// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
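	//
	// The 256-bit product Xh|Xl is reduced modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1. In the bit-reflected representation
	// used here, the x^7 + x^2 + x terms show up as the left shifts by
	// 57/62/63 in the 1st phase and the right shifts by 1/2/7 in the
	// 2nd; together the two phases fold the high half of the product
	// back into the low half. Roughly, per 64-bit lane (a sketch, not
	// the exact instruction sequence):
	//
	//	t  = (Xl << 57) ^ (Xl << 62) ^ (Xl << 63)		// 1st phase
	//	Xl = Xl ^ (Xl >> 1) ^ (Xl >> 2) ^ (Xl >> 7) ^ Xh	// 2nd phase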
	shl	v17.2d, v0.2d, #57	// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]	// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]	// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1	// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1	//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	.Loop_neon

	rev64	v0.16b, v0.16b	// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits