// The input consists of five valid character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
//   #  From        To         LUT  Characters
//   1  [0..42]     [255]      #1   invalid input
//   2  [43]        [62]       #1   +
//   3  [44..46]    [255]      #1   invalid input
//   4  [47]        [63]       #1   /
//   5  [48..57]    [52..61]   #1   0..9
//   6  [58..63]    [255]      #1   invalid input
//   7  [64]        [255]      #2   invalid input
//   8  [65..90]    [0..25]    #2   A..Z
//   9  [91..96]    [255]      #2   invalid input
//  10  [97..122]   [26..51]   #2   a..z
//  11  [123..126]  [255]      #2   invalid input
// (12) Everything else => invalid input
18
// The first LUT is consumed by the VTBL instruction, which writes 0 to the
// destination for any index outside the 64-byte table. It therefore only
// needs to cover input bytes [0..63]; everything not a valid Base64 symbol
// in that range decodes to the sentinel 255U.
static const uint8_t dec_lut1[] = {
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,	//  0.. 7: invalid
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,	//  8..15: invalid
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,	// 16..23: invalid
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,	// 24..31: invalid
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,	// 32..39: invalid
	255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,	// 40..47: '+' -> 62, '/' -> 63
	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,	// 48..55: '0'..'7' -> 52..59
	 60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,	// 56..63: '8','9' -> 60, 61
};
27
// The second LUT is consumed by the VTBX instruction, which leaves the
// destination unchanged for any index outside the 64-byte table. Input bytes
// [64..126] are mapped onto indices [1..63] of this table (by a saturating
// subtraction of 63); index 0 marks values that are resolved by LUT #1.
static const uint8_t dec_lut2[] = {
	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,	// LUT #1 slot, '@', 'A'..'E'
	  6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,	// 'F'..'M'
	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,	// 'N'..'U'
	 22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,	// 'V'..'Z', '['..'^'
	255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,	// '_', '`', 'a'..'f'
	 32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,	// 'g'..'n'
	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,	// 'o'..'v'
	 48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,	// 'w'..'z', '{'..'~'
};
37
// All input values in range for the first look-up will be 0U in the second
// look-up result. All input values out of range for the first look-up will be
// 0U in the first look-up result. Thus, the two results can be ORed without
// conflicts.
//
// Invalid characters that are in the valid range for either look-up will be
// set to 255U in the combined result. Other invalid characters will just be
// passed through with the second look-up result (using the VTBX instruction).
// Since the second LUT is 64 bytes, those passed-through values are guaranteed
// to have a value greater than 63U. Therefore, valid characters will be mapped
// to the valid [0..63] range and all invalid characters will be mapped to
// values greater than 63.
50
// Bulk AArch64 NEON decoding loop: consumes the input in blocks of 64 Base64
// characters and produces 48 decoded bytes per block, advancing *s and *o and
// updating *slen/*olen accordingly. On the first block containing an invalid
// character it stops early (re-crediting the unprocessed rounds to the
// counters), leaving the tail — including the offending character — for the
// caller's scalar decoder.
static inline void
dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Too little input for even one full round; nothing to do here.
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	// Optimistically account for all rounds up front; compensated after
	// the loop if we break out early on invalid input.
	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);

	do {
		const uint8x16_t offset = vdupq_n_u8(63U);
		uint8x16x4_t dec1, dec2;
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
		uint8x16x4_t str = vld4q_u8((uint8_t *) *s);

		// Get indices for second LUT: a saturating subtract of 63
		// sends bytes [0..63] to index 0 (whose LUT #2 entry is 0U)
		// and bytes [64..126] to indices [1..63]:
		dec2.val[0] = vqsubq_u8(str.val[0], offset);
		dec2.val[1] = vqsubq_u8(str.val[1], offset);
		dec2.val[2] = vqsubq_u8(str.val[2], offset);
		dec2.val[3] = vqsubq_u8(str.val[3], offset);

		// Get values from first LUT (VTBL: out-of-range indices
		// produce 0U, so bytes > 63 contribute nothing here):
		dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
		dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
		dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
		dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);

		// Get values from second LUT (VTBX: out-of-range indices,
		// i.e. bytes > 126, pass through unchanged and stay > 63U):
		dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
		dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
		dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
		dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);

		// Get final values: the two partial look-ups are disjoint
		// (one of them is always 0U), so a plain OR merges them:
		str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
		str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
		str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
		str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);

		// Check for invalid input, any value larger than 63:
		const uint8x16_t classified
			= vcgtq_u8(str.val[0], vdupq_n_u8(63))
			| vcgtq_u8(str.val[1], vdupq_n_u8(63))
			| vcgtq_u8(str.val[2], vdupq_n_u8(63))
			| vcgtq_u8(str.val[3], vdupq_n_u8(63));

		// Check that all bits are zero; any set lane means this block
		// contains an invalid character, so fall back to scalar code:
		if (vmaxvq_u8(classified) != 0U) {
			break;
		}

		// Compress four 6-bit values into three bytes:
		dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
		dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
		dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];

		// Interleave and store decoded result:
		vst3q_u8((uint8_t *) *o, dec);

		*s += 64;
		*o += 48;

	} while (--rounds > 0);

	// Adjust for any rounds that were skipped due to the early break:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}
130