// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.code	32
#undef	__thumb2__
.globl	_gcm_init_v8
.private_extern	_gcm_init_v8
#ifdef __thumb2__
.thumb_func	_gcm_init_v8
#endif
.align	4
_gcm_init_v8:
	vld1.64	{q9},[r1]		@ load input H
	vmov.i8	q11,#0xe1
	vshl.i64	q11,q11,#57		@ 0xc2.0
	vext.8	q3,q9,q9,#8
	vshr.u64	q10,q11,#63
	vdup.32	q9,d18[1]
	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
	vshr.u64	q10,q3,#63
	vshr.s32	q9,q9,#31		@ broadcast carry bit
	vand	q10,q10,q8
	vshl.i64	q3,q3,#1
	vext.8	q10,q10,q10,#8
	vand	q8,q8,q9
	vorr	q3,q3,q10		@ H<<<=1
	veor	q12,q3,q8		@ twisted H
	vst1.64	{q12},[r0]!		@ store Htable[0]

	@ calculate H^2
	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
	veor	q8,q8,q12
.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q14,q0,q10

	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
	veor	q9,q9,q14
	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
	vst1.64	{q13,q14},[r0]		@ store Htable[1..2]

	bx	lr
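@ Layout produced above: Htable[0] holds the "twisted" H (H shifted left by
@ one bit and folded with the 0xc2..01 constant), Htable[2] holds twisted H^2,
@ and Htable[1] packs the xor-ed halves of H and H^2 (the Karatsuba
@ pre-processed terms), so each 128-bit multiply below needs only three
@ polynomial multiplications.
@ The .byte sequences throughout this file are hand-encoded pmull/pmull2
@ (vmull.p64) instructions, presumably emitted as raw opcodes so the file
@ still assembles with toolchains lacking 32-bit ARMv8 Crypto Extensions
@ support; the trailing @ comments give the intended mnemonics.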
.globl	_gcm_gmult_v8
.private_extern	_gcm_gmult_v8
#ifdef __thumb2__
.thumb_func	_gcm_gmult_v8
#endif
.align	4
_gcm_gmult_v8:
	vld1.64	{q9},[r0]		@ load Xi
	vmov.i8	q11,#0xe1
	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
	vshl.u64	q11,q11,#57
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q3,q9,q9,#8

.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	bx	lr
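@ Roughly, _gcm_gmult_v8 computes Xi = Xi * H in GF(2^128) reduced modulo the
@ GHASH polynomial x^128 + x^7 + x^2 + x + 1: three pmull operations form the
@ 256-bit Karatsuba product, and the two multiplications by q11 (the 0xc2..0
@ constant) carry out the two-phase reduction. _gcm_ghash_v8 below applies
@ the same pattern to a whole buffer, two blocks per iteration.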
.globl	_gcm_ghash_v8
.private_extern	_gcm_ghash_v8
#ifdef __thumb2__
.thumb_func	_gcm_ghash_v8
#endif
.align	4
_gcm_ghash_v8:
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
	vld1.64	{q0},[r0]		@ load [rotated] Xi
					@ "[rotated]" means that
					@ loaded value would have
					@ to be rotated in order to
					@ make it appear as in
					@ algorithm specification
	subs	r3,r3,#32		@ see if r3 is 32 or larger
	mov	r12,#16		@ r12 is used as post-
					@ increment for input pointer;
					@ as loop is modulo-scheduled
					@ r12 is zeroed just in time
					@ to preclude overstepping
					@ inp[len], which means that
					@ last block[s] are actually
					@ loaded twice, but last
					@ copy is not processed
	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
	vmov.i8	q11,#0xe1
	vld1.64	{q14},[r1]
	moveq	r12,#0			@ is it time to zero r12?
	vext.8	q0,q0,q0,#8		@ rotate Xi
	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	q8,q8
	vrev64.8	q0,q0
#endif
	vext.8	q3,q8,q8,#8		@ rotate I[0]
	blo	Lodd_tail_v8		@ r3 was less than 32
	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q7,q9,q9,#8
	veor	q3,q3,q0		@ I[i]^=Xi
.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q9,q9,q7		@ Karatsuba pre-processing
.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	b	Loop_mod2x_v8

.align	4
Loop_mod2x_v8:
	vext.8	q10,q3,q3,#8
	subs	r3,r3,#32		@ is there more data?
.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
	movlo	r12,#0			@ is it time to zero r12?

.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
	veor	q10,q10,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
	veor	q0,q0,q4		@ accumulate
.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10	@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]

	veor	q2,q2,q6
	moveq	r12,#0			@ is it time to zero r12?
	veor	q1,q1,q5

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	q8,q8
#endif
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	vext.8	q7,q9,q9,#8
	vext.8	q3,q8,q8,#8
	veor	q0,q1,q10
.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q3,q3,q2		@ accumulate q3 early

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q3,q3,q10
	veor	q9,q9,q7		@ Karatsuba pre-processing
	veor	q3,q3,q0
.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	bhs	Loop_mod2x_v8		@ there was at least 32 more bytes

	veor	q2,q2,q10
	vext.8	q3,q8,q8,#8		@ re-construct q3
	adds	r3,r3,#32		@ re-construct r3
	veor	q0,q0,q2		@ re-construct q0
	beq	Ldone_v8		@ is r3 zero?
Lodd_tail_v8:
	vext.8	q10,q0,q0,#8
	veor	q3,q3,q0		@ inp^=Xi
	veor	q9,q8,q10		@ q9 is rotated inp^Xi

.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
	bx	lr

.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM
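@ In _gcm_ghash_v8 above, r0 points at Xi, r1 at the Htable written by
@ _gcm_init_v8, r2 at the input and r3 holds the byte count; the modulo-2x
@ loop folds two 16-byte blocks per iteration (the running value is
@ multiplied by H^2 and the following block by H), Lodd_tail_v8 handles a
@ single remaining block, and the length is assumed to be a multiple of 16.
@ The trailing .byte string is the CRYPTOGAMS identification string.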