// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl gcm_init_neon

.def gcm_init_neon
   .type 32
.endef
.align 4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b		// H<<<=1
	eor	v5.16b, v3.16b, v16.16b		// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl gcm_gmult_neon

.def gcm_gmult_neon
   .type 32
.endef
.align 4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	Lgmult_neon


.globl gcm_ghash_neon

.def gcm_ghash_neon
   .type 32
.endef
.align 4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks		// load constants
	add	x9, x9, :lo12:Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

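	// In outline (a summary inferred from the per-step comments below):
	// one GHASH step multiplies Xi by the twisted H with a single level
	// of Karatsuba over 64-bit halves. With * a 64x64-bit carry-less
	// multiply and ^ exclusive-OR:
	//
	//	Xl = v5 * v3			// one half of H, one half of Xi
	//	Xh = v6 * v4			// the other halves
	//	Xm = (v5 ^ v6) * (v3 ^ v4)	// folded halves; v7 holds v5 ^ v6
	//
	// Each 64x64-bit multiply is itself assembled from pmull on .8b
	// inputs, which performs eight independent 8x8-bit carry-less
	// multiplies, applied to the byte-rotated operands A1-A3 and B1-B4
	// below. The Karatsuba post-processing and the reduction modulo the
	// GHASH polynomial follow at the end of Lgmult_neon.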
Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
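
	// The first 64x64-bit product (Xl) is now complete in v0. The second
	// copy of the multiply computes the Karatsuba middle term: v3 is
	// first folded with v4, and v7 (= v5 ^ v6, prepared earlier) replaces
	// v5 as the multiplicand; the result is accumulated in v1 (Xm).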
	eor	v3.8b, v3.8b, v4.8b		// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
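
	// v0, v1 and v2 now hold the three partial products Xl, Xm and Xh.
	// The Karatsuba post-processing below folds Xm back into the outer
	// products, leaving a 256-bit product spread across v0 and v2, which
	// is then reduced to 128 bits modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1 (the two-phase reduction that follows).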
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b		// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b		// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b		//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b		//
	eor	v0.16b, v0.16b, v18.16b		//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b			// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section .rodata
.align 4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif  // !OPENSSL_NO_ASM