// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include "ring_core_generated/prefix_symbols_asm.h"
#include <ring-core/arm_arch.h>

.text

.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b		// H<<<=1
	eor	v5.16b, v3.16b, v16.16b		// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	mov	x3, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

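	// What the block above sets up, roughly: pmull on .8b operands is an
	// 8-bit polynomial multiply of eight independent byte lanes, so a
	// 64x64-bit carry-less product has to be assembled from lane-wise
	// products of byte-rotated operands. L, M and N pair the rotations
	// by 1, 2 and 3 bytes in both directions, K below covers the
	// rotation by 4 bytes, and D = A*B is the unrotated lane-wise
	// product. Once the lanes that wrap around are masked off, each sum
	// is shifted to its byte offset and XORed into D to reconstruct the
	// full product.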
	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
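	// v0 now holds the first of the three Karatsuba products, formed
	// from the low halves of the two operands (v5 and v3). The
	// zip/eor/and/zip sequence above is the paired-up form of the
	// veor/vand steps quoted from the 32-bit version, using the
	// k48/k32/k16 masks preloaded into v24/v25 from .Lmasks, and the
	// ext #15/#14/#13/#12 rotations then place t0..t3 at byte offsets
	// 1..4 (<< 8, 16, 24, 32) before they are folded into D. The same
	// pattern runs twice more below: once for the middle Karatsuba term
	// (v7 = v5^v6 against v3^v4) into v1, and once for the high halves
	// (v6 and v4) into v2.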
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]	// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

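	// v0 (Xl), v1 (Xm) and v2 (Xh) now hold the unreduced 256-bit
	// product, with Xm overlapping the adjacent halves of Xl and Xh as
	// noted above. The block below is the usual two-phase reduction
	// modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1 on the
	// bit-reflected representation: the first phase folds Xl shifted
	// left by 57, 62 and 63 bits (64 minus 7, 2 and 1) into the middle
	// words, and the second phase XORs in per-lane right shifts by 1, 2
	// and 7 together with Xh, leaving the reduced 128-bit result in v0.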
	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	.Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM
.section	.note.GNU-stack,"",%progbits