1 // The input consists of six character sets in the Base64 alphabet, which we
2 // need to map back to the 6-bit values they represent. There are three ranges,
3 // two singles, and then there's the rest.
4 //
5 // # From To Add Characters
6 // 1 [43] [62] +19 +
7 // 2 [47] [63] +16 /
8 // 3 [48..57] [52..61] +4 0..9
9 // 4 [65..90] [0..25] -65 A..Z
10 // 5 [97..122] [26..51] -71 a..z
11 // (6) Everything else => invalid input
12 //
13 // We will use lookup tables for character validation and offset computation.
// Remember that 0x2X and 0x0X select the same index in _mm_shuffle_epi8
// (only bit 7 and the low four bits of each index byte are used). This allows
// masking with 0x2F instead of 0x0F, which saves one constant declaration
// (register and/or memory access).
17 //
18 // For offsets:
19 // Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
20 // 0000 = garbage
21 // 0001 = /
22 // 0010 = +
23 // 0011 = 0-9
24 // 0100 = A-Z
25 // 0101 = A-Z
26 // 0110 = a-z
27 // 0111 = a-z
28 // 1000 >= garbage
29 //
30 // For validation, here's the table.
31 // A character is valid if and only if the AND of the 2 lookups equals 0:
32 //
33 // hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
34 // LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
35 //
36 // 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
37 // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
38 //
39 // 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
40 // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
41 //
// 0010 0x01 char SP ! " # $ % & ' ( ) * + , - . /
43 // andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
44 //
45 // 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
46 // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
47 //
48 // 0100 0x04 char @ A B C D E F G H I J K L M N O
49 // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
50 //
51 // 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
52 // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
53 //
54 // 0110 0x04 char ` a b c d e f g h i j k l m n o
55 // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0111 0x08 char p q r s t u v w x y z { | } ~ DEL
57 // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
58 //
59 // 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
60 // 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
61 // 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
62 // 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
63 // 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
64 // 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
65 // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
66 // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
67
68 static inline int
dec_loop_ssse3_inner(const uint8_t ** s,uint8_t ** o,size_t * rounds)69 dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
70 {
71 const __m128i lut_lo = _mm_setr_epi8(
72 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
73 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
74
75 const __m128i lut_hi = _mm_setr_epi8(
76 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
77 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
78
79 const __m128i lut_roll = _mm_setr_epi8(
80 0, 16, 19, 4, -65, -65, -71, -71,
81 0, 0, 0, 0, 0, 0, 0, 0);
82
83 const __m128i mask_2F = _mm_set1_epi8(0x2F);
84
85 // Load input:
86 __m128i str = _mm_loadu_si128((__m128i *) *s);
87
88 // Table lookups:
89 const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
90 const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
91 const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
92 const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
93
94 // Check for invalid input: if any "and" values from lo and hi are not
95 // zero, fall back on bytewise code to do error checking and reporting:
96 if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
97 return 0;
98 }
99
100 const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
101 const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
102
103 // Now simply add the delta values to the input:
104 str = _mm_add_epi8(str, roll);
105
106 // Reshuffle the input to packed 12-byte output format:
107 str = dec_reshuffle(str);
108
109 // Store the output:
110 _mm_storeu_si128((__m128i *) *o, str);
111
112 *s += 16;
113 *o += 12;
114 *rounds -= 1;
115
116 return 1;
117 }
118
// Bulk-decode as many complete 16-byte groups as possible from *s into *o.
//
//   s     in/out  read cursor, moved forward by 16 per decoded group
//   slen  in/out  input bytes left at *s, reduced by 16 per decoded group
//   o     in/out  write cursor, moved forward by 12 per decoded group
//   olen  in/out  output byte count, increased by 12 per decoded group
//
// Decoding stops at the first group that dec_loop_ssse3_inner() rejects; that
// group and everything behind it stays unconsumed for the caller to handle.
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// A single round needs 16 input bytes plus the 8-byte reserve below.
	if (*slen < 24) {
		return;
	}

	// Each round writes 4 extra zero bytes after its 12 output bytes. To
	// cover that gap, always leave at least 8 input bytes unprocessed
	// (6 data bytes and up to two end-of-string markers).
	size_t todo = (*slen - 8) / 16;

	// Book every planned round up front: 16 bytes in, 12 bytes out. Rounds
	// that turn out not to run are refunded after the loop.
	*slen -= todo * 16;
	*olen += todo * 12;

	// The guard above makes todo at least 1 on entry. Rounds are issued in
	// unrolled batches of 8, 4, 2 or 1; the && chains stop at the first
	// rejected group, which leaves todo at the number of rounds not done.
	while (todo > 0) {
		int ok;

		if (todo >= 8) {
			ok = dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo);
		}
		else if (todo >= 4) {
			ok = dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo);
		}
		else if (todo >= 2) {
			ok = dec_loop_ssse3_inner(s, o, &todo)
			  && dec_loop_ssse3_inner(s, o, &todo);
		}
		else {
			ok = dec_loop_ssse3_inner(s, o, &todo);
		}

		if (!ok) {
			break;
		}
	}

	// Refund the rounds that were booked but never completed.
	*slen += todo * 16;
	*olen -= todo * 12;
}
174