#ifdef BASE64_NEON32_USE_ASM
static inline void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
	// This function duplicates the functionality of enc_loop_neon32_inner,
	// but entirely with inline assembly. This gives a significant speedup
	// over using NEON intrinsics, which do not always generate very good
	// code. The logic of the assembly is directly lifted from the
	// intrinsics version, so it can be used as a guide to this code.

	// Temporary registers, used as scratch space.
	uint8x16_t tmp0, tmp1, tmp2, tmp3;
	uint8x16_t mask0, mask1, mask2, mask3;

	// A lookup table containing the absolute offsets for all ranges.
	const uint8x16_t lut = {
		 65U,  71U, 252U, 252U,
		252U, 252U, 252U, 252U,
		252U, 252U, 252U, 252U,
		237U, 240U,   0U,   0U
	};
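	// The LUT index computed below is 0 for values 0..25, 1 for values
	// 26..51, (value - 50) for values 52..61, 12 for 62 and 13 for 63.
	// Each entry is the byte offset (mod 256) from the 6-bit value to
	// its ASCII code: 'A' - 0 = 65, 'a' - 26 = 71, '0' - 52 = -4 (252),
	// '+' - 62 = -19 (237) and '/' - 63 = -16 (240).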

	// Numeric constants.
	const uint8x16_t n51 = vdupq_n_u8(51);
	const uint8x16_t n25 = vdupq_n_u8(25);
	const uint8x16_t n63 = vdupq_n_u8(63);
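	// n25 and n51 are the upper bounds of the first two ranges of the
	// Base64 alphabet ('A'..'Z' and 'a'..'z'); n63 masks values down to
	// six bits.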

	__asm__ (

		// Load 48 bytes and deinterleave. The bytes are loaded to
		// hard-coded registers q12, q13 and q14, to ensure that they
		// are contiguous. Increment the source pointer.
		"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
		"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
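		// After both loads, q12 holds input bytes 0, 3, 6, ..., q13
		// holds bytes 1, 4, 7, ... and q14 holds bytes 2, 5, 8, ...
		// of the 48-byte block.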

		// Reshuffle the bytes using temporaries.
		"vshr.u8 %q[t0], q12, #2         \n\t"
		"vshr.u8 %q[t1], q13, #4         \n\t"
		"vshr.u8 %q[t2], q14, #6         \n\t"
		"vsli.8  %q[t1], q12, #4         \n\t"
		"vsli.8  %q[t2], q13, #2         \n\t"
		"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
		"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
		"vand.u8 %q[t3], q14,    %q[n63] \n\t"
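		// In scalar terms, for each input triple (a, b, c) this
		// computes the four 6-bit values:
		//   t0 = a >> 2
		//   t1 = ((a << 4) | (b >> 4)) & 0x3F
		//   t2 = ((b << 2) | (c >> 6)) & 0x3F
		//   t3 = c & 0x3F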

		// t0..t3 are the reshuffled inputs. Create LUT indices.
		"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
		"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
		"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
		"vqsub.u8 q15, %q[t3], %q[n51] \n\t"

		// Create the mask for range #0: 0xFF where the value is
		// greater than 25.
		"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
		"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
		"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
		"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"

		// Subtract the mask (-1 where set) to correct the LUT indices.
		"vsub.u8 q12, %q[m0] \n\t"
		"vsub.u8 q13, %q[m1] \n\t"
		"vsub.u8 q14, %q[m2] \n\t"
		"vsub.u8 q15, %q[m3] \n\t"
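		// q12..q15 now hold max(value - 51, 0) + (value > 25 ? 1 : 0)
		// per byte, which is exactly the LUT index described above.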

		// Look up the delta values.
		"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
		"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
		"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
		"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
		"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
		"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
		"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
		"vtbl.u8 d31, {%q[lut]}, d31 \n\t"
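		// (vtbl works on 64-bit halves, so each q register requires
		// two lookups: one for its low d register, one for its high.)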

		// Add the delta values.
		"vadd.u8 q12, %q[t0] \n\t"
		"vadd.u8 q13, %q[t1] \n\t"
		"vadd.u8 q14, %q[t2] \n\t"
		"vadd.u8 q15, %q[t3] \n\t"
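		// The addition wraps modulo 256, so the LUT bytes 252, 237
		// and 240 act as the negative offsets -4, -19 and -16.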

		// Interleave and store 64 bytes. Increment the dest pointer.
		"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
		"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"

		// Outputs (modified).
		: [src] "+r"  (*s),
		  [dst] "+r"  (*o),
		  [t0]  "=&w" (tmp0),
		  [t1]  "=&w" (tmp1),
		  [t2]  "=&w" (tmp2),
		  [t3]  "=&w" (tmp3),
		  [m0]  "=&w" (mask0),
		  [m1]  "=&w" (mask1),
		  [m2]  "=&w" (mask2),
		  [m3]  "=&w" (mask3)

		// Inputs (not modified).
		: [lut] "w" (lut),
		  [n25] "w" (n25),
		  [n51] "w" (n51),
		  [n63] "w" (n63)

		// Clobbers.
		: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
		  "cc", "memory"
	);
}
#endif
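
// For reference, a minimal scalar sketch of the work done by one round of
// the NEON inner loop. This helper is hypothetical and unused by this
// file; it assumes the standard Base64 alphabet.
static inline void
enc_round_scalar_sketch (const uint8_t **s, uint8_t **o)
{
	static const char alphabet[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789+/";

	// Encode 16 triples of input bytes into 16 quads of output bytes.
	for (size_t i = 0; i < 16; i++) {
		const uint8_t a = (*s)[3 * i + 0];
		const uint8_t b = (*s)[3 * i + 1];
		const uint8_t c = (*s)[3 * i + 2];

		(*o)[4 * i + 0] = (uint8_t) alphabet[a >> 2];
		(*o)[4 * i + 1] = (uint8_t) alphabet[((a << 4) | (b >> 4)) & 0x3F];
		(*o)[4 * i + 2] = (uint8_t) alphabet[((b << 2) | (c >> 6)) & 0x3F];
		(*o)[4 * i + 3] = (uint8_t) alphabet[c & 0x3F];
	}

	*s += 48;
	*o += 64;
}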

static inline void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
	enc_loop_neon32_inner_asm(s, o);
#else
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Reshuffle:
	uint8x16x4_t out = enc_reshuffle(src);

	// Translate reshuffled bytes to the Base64 alphabet:
	out = enc_translate(out);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
#endif
}

static inline void
enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

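	// Unroll the calls to the inner loop in blocks of 8, 4, 2 and 1
	// rounds to amortize the loop overhead.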
	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_neon32_inner(s, o);
		break;
	}
}
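
// Usage sketch (hypothetical driver, illustrative only): encode as many
// whole 48-byte blocks as possible; the remaining tail of fewer than 48
// bytes is left for the caller's scalar fallback.
static inline size_t
enc_loop_neon32_example (const uint8_t *src, size_t srclen, uint8_t *dst)
{
	size_t dstlen = 0;

	enc_loop_neon32(&src, &srclen, &dst, &dstlen);

	// On return, srclen is less than 48 and holds the leftover byte
	// count; dstlen is the number of Base64 bytes written to dst.
	return dstlen;
}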