// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__arm__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
.fpu neon
.code 32
#undef __thumb2__
.globl gcm_init_v8
.hidden gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	vld1.64 {q9},[r1]	@ load input H
	vmov.i8 q11,#0xe1
	vshl.i64 q11,q11,#57	@ 0xc2.0
	vext.8 q3,q9,q9,#8
	vshr.u64 q10,q11,#63
	vdup.32 q9,d18[1]
	vext.8 q8,q10,q11,#8	@ t0=0xc2....01
	vshr.u64 q10,q3,#63
	vshr.s32 q9,q9,#31	@ broadcast carry bit
	vand q10,q10,q8
	vshl.i64 q3,q3,#1
	vext.8 q10,q10,q10,#8
	vand q8,q8,q9
	vorr q3,q3,q10	@ H<<<=1
	veor q12,q3,q8	@ twisted H
	vst1.64 {q12},[r0]!	@ store Htable[0]

	@ calculate H^2
	vext.8 q8,q12,q12,#8	@ Karatsuba pre-processing
.byte 0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
	veor q8,q8,q12
.byte 0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
.byte 0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8

	vext.8 q9,q0,q2,#8	@ Karatsuba post-processing
	veor q10,q0,q2
	veor q1,q1,q9
	veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase

	vmov d4,d3	@ Xh|Xm - 256-bit result
	vmov d3,d0	@ Xm is rotated Xl
	veor q0,q1,q10

	vext.8 q10,q0,q0,#8	@ 2nd phase
.byte 0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor q10,q10,q2
	veor q14,q0,q10

	vext.8 q9,q14,q14,#8	@ Karatsuba pre-processing
	veor q9,q9,q14
	vext.8 q13,q8,q9,#8	@ pack Karatsuba pre-processed
	vst1.64 {q13,q14},[r0]	@ store Htable[1..2]

	bx lr
.size gcm_init_v8,.-gcm_init_v8
.globl gcm_gmult_v8
.hidden gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	vld1.64 {q9},[r0]	@ load Xi
	vmov.i8 q11,#0xe1
	vld1.64 {q12,q13},[r1]	@ load twisted H, ...
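				@ per gcm_init_v8 above: q12 is the "twisted" H
				@ (Htable[0]) and q13 packs the Karatsuba helpers
				@ H.lo^H.hi and (H^2).lo^(H^2).hi (Htable[1]);
				@ the vshl below turns q11 into the 0xc2.0
				@ reduction constant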
	vshl.u64 q11,q11,#57
#ifndef __ARMEB__
	vrev64.8 q9,q9
#endif
	vext.8 q3,q9,q9,#8

.byte 0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3	@ H.lo·Xi.lo
	veor q9,q9,q3	@ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3	@ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9	@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8 q9,q0,q2,#8	@ Karatsuba post-processing
	veor q10,q0,q2
	veor q1,q1,q9
	veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase of reduction

	vmov d4,d3	@ Xh|Xm - 256-bit result
	vmov d3,d0	@ Xm is rotated Xl
	veor q0,q1,q10

	vext.8 q10,q0,q0,#8	@ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor q10,q10,q2
	veor q0,q0,q10

#ifndef __ARMEB__
	vrev64.8 q0,q0
#endif
	vext.8 q0,q0,q0,#8
	vst1.64 {q0},[r0]	@ write out Xi

	bx lr
.size gcm_gmult_v8,.-gcm_gmult_v8
.globl gcm_ghash_v8
.hidden gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ 32-bit ABI says so
	vld1.64 {q0},[r0]	@ load [rotated] Xi
				@ "[rotated]" means that
				@ loaded value would have
				@ to be rotated in order to
				@ make it appear as in
				@ algorithm specification
	subs r3,r3,#32	@ see if r3 is 32 or larger
	mov r12,#16	@ r12 is used as post-
			@ increment for input pointer;
			@ as loop is modulo-scheduled
			@ r12 is zeroed just in time
			@ to preclude overstepping
			@ inp[len], which means that
			@ last block[s] are actually
			@ loaded twice, but last
			@ copy is not processed
	vld1.64 {q12,q13},[r1]!	@ load twisted H, ..., H^2
	vmov.i8 q11,#0xe1
	vld1.64 {q14},[r1]
	moveq r12,#0	@ is it time to zero r12?
	vext.8 q0,q0,q0,#8	@ rotate Xi
	vld1.64 {q8},[r2]!	@ load [rotated] I[0]
	vshl.u64 q11,q11,#57	@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8 q8,q8
	vrev64.8 q0,q0
#endif
	vext.8 q3,q8,q8,#8	@ rotate I[0]
	blo .Lodd_tail_v8	@ r3 was less than 32
	vld1.64 {q9},[r2],r12	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8 q9,q9
#endif
	vext.8 q7,q9,q9,#8
	veor q3,q3,q0	@ I[i]^=Xi
.byte 0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7	@ H·Ii+1
	veor q9,q9,q7	@ Karatsuba pre-processing
.byte 0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	b .Loop_mod2x_v8

.align 4
.Loop_mod2x_v8:
	vext.8 q10,q3,q3,#8
	subs r3,r3,#32	@ is there more data?
.byte 0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3	@ H^2.lo·Xi.lo
	movlo r12,#0	@ is it time to zero r12?

.byte 0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
	veor q10,q10,q3	@ Karatsuba pre-processing
.byte 0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3	@ H^2.hi·Xi.hi
	veor q0,q0,q4	@ accumulate
.byte 0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10	@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64 {q8},[r2],r12	@ load [rotated] I[i+2]

	veor q2,q2,q6
	moveq r12,#0	@ is it time to zero r12?
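				@ q4/q5/q6 carry the I[i+1]·H partial products
				@ prepared a step ahead; folding them into q0/q1/q2,
				@ the (Xi^I[i])·H^2 partial products (the last fold
				@ is the veor just below), means only one reduction
				@ is needed per two input blocks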
	veor q1,q1,q5

	vext.8 q9,q0,q2,#8	@ Karatsuba post-processing
	veor q10,q0,q2
	veor q1,q1,q9
	vld1.64 {q9},[r2],r12	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8 q8,q8
#endif
	veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8 q9,q9
#endif
	vmov d4,d3	@ Xh|Xm - 256-bit result
	vmov d3,d0	@ Xm is rotated Xl
	vext.8 q7,q9,q9,#8
	vext.8 q3,q8,q8,#8
	veor q0,q1,q10
.byte 0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7	@ H·Ii+1
	veor q3,q3,q2	@ accumulate q3 early

	vext.8 q10,q0,q0,#8	@ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor q3,q3,q10
	veor q9,q9,q7	@ Karatsuba pre-processing
	veor q3,q3,q0
.byte 0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	bhs .Loop_mod2x_v8	@ there was at least 32 more bytes

	veor q2,q2,q10
	vext.8 q3,q8,q8,#8	@ re-construct q3
	adds r3,r3,#32	@ re-construct r3
	veor q0,q0,q2	@ re-construct q0
	beq .Ldone_v8	@ is r3 zero?
.Lodd_tail_v8:
	vext.8 q10,q0,q0,#8
	veor q3,q3,q0	@ inp^=Xi
	veor q9,q8,q10	@ q9 is rotated inp^Xi

.byte 0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3	@ H.lo·Xi.lo
	veor q9,q9,q3	@ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3	@ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9	@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8 q9,q0,q2,#8	@ Karatsuba post-processing
	veor q10,q0,q2
	veor q1,q1,q9
	veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11	@ 1st phase of reduction

	vmov d4,d3	@ Xh|Xm - 256-bit result
	vmov d3,d0	@ Xm is rotated Xl
	veor q0,q1,q10

	vext.8 q10,q0,q0,#8	@ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor q10,q10,q2
	veor q0,q0,q10

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8 q0,q0
#endif
	vext.8 q0,q0,q0,#8
	vst1.64 {q0},[r0]	@ write out Xi

	vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ 32-bit ABI says so
	bx lr
.size gcm_ghash_v8,.-gcm_ghash_v8
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits
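@ Calling-convention notes, inferred from the register usage above (the
@ authoritative C prototypes live in BoringSSL's GCM headers, not here):
@   gcm_init_v8:  r0 = Htable to fill, r1 = 128-bit hash key H
@   gcm_gmult_v8: r0 = 16-byte Xi (updated in place), r1 = Htable
@   gcm_ghash_v8: r0 = Xi, r1 = Htable, r2 = input, r3 = length in bytes
@                 (the code assumes whole 16-byte blocks)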