// The input consists of five valid character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
//   #  From       To        LUT  Characters
//   1  [0..42]    [255]      #1  invalid input
//   2  [43]       [62]       #1  +
//   3  [44..46]   [255]      #1  invalid input
//   4  [47]       [63]       #1  /
//   5  [48..57]   [52..61]   #1  0..9
//   6  [58..63]   [255]      #1  invalid input
//   7  [64]       [255]      #2  invalid input
//   8  [65..90]   [0..25]    #2  A..Z
//   9  [91..96]   [255]      #2  invalid input
//  10  [97..122]  [26..51]   #2  a..z
//  11  [123..126] [255]      #2  invalid input
// (12) Everything else => invalid input
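//
// For example, '+' (43) maps to 62 and '0' (48) maps to 52 through LUT #1,
// while 'A' (65) maps to 0 and 'a' (97) maps to 26 through LUT #2.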

// The first LUT will use the VTBL instruction (out of range indices are set to
// 0 in destination).
static const uint8_t dec_lut1[] = {
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
};

// The second LUT will use the VTBX instruction (out of range indices will be
// unchanged in destination). Input [64..126] will be mapped to index [1..63]
// in this LUT. Index 0 means that value comes from LUT #1.
static const uint8_t dec_lut2[] = {
	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
	255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
};

// All input values in range for the first look-up will be 0U in the second
// look-up result. All input values out of range for the first look-up will be
// 0U in the first look-up result. Thus, the two results can be ORed without
// conflicts.
//
// Invalid characters that are in the valid range for either look-up will be
// set to 255U in the combined result. Other invalid characters will just be
// passed through with the second look-up result (using the VTBX instruction).
// Since the second LUT is 64 bytes, those passed-through values are guaranteed
// to have a value greater than 63U. Therefore, valid characters will be mapped
// to the valid [0..63] range and all invalid characters will be mapped to
// values greater than 63.

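// For reference, a minimal scalar sketch of the same two-table scheme. This
// helper is purely illustrative (it is not used by the SIMD loop below, and
// the name is made up for this sketch); it mirrors the VTBL/VTBX semantics
// described above:
static inline uint8_t
dec_scalar_sketch (const uint8_t c)
{
	// LUT #1: like VTBL, out-of-range indices produce zero.
	const uint8_t v1 = (c < 64U) ? dec_lut1[c] : 0U;

	// LUT #2: saturating-subtract the offset 63, then look up. Like VTBX,
	// out-of-range indices leave the index value itself in place.
	const uint8_t idx = (c > 63U) ? (uint8_t) (c - 63U) : 0U;
	const uint8_t v2  = (idx < 64U) ? dec_lut2[idx] : idx;

	// Valid characters combine to [0..63]; anything larger is invalid.
	return v1 | v2;
}
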
static inline void
dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);

	do {
		const uint8x16_t offset = vdupq_n_u8(63U);
		uint8x16x4_t dec1, dec2;
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
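		// (vld4q_u8 deinterleaves with a stride of four: str.val[0] gets
		// input bytes 0, 4, 8, ..., str.val[1] gets bytes 1, 5, 9, ...,
		// and so on, so each vector holds one character position of the
		// sixteen Base64 quads.)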
		uint8x16x4_t str = vld4q_u8((uint8_t *) *s);

		// Get indices for second LUT:
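		// (vqsubq_u8 is a saturating subtract: inputs [0..63] become index
		// 0, inputs [64..126] become [1..63], and inputs above 126 become
		// indices above 63, which fall outside the 64-byte LUT.)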
		dec2.val[0] = vqsubq_u8(str.val[0], offset);
		dec2.val[1] = vqsubq_u8(str.val[1], offset);
		dec2.val[2] = vqsubq_u8(str.val[2], offset);
		dec2.val[3] = vqsubq_u8(str.val[3], offset);

		// Get values from first LUT:
		dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
		dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
		dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
		dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);

		// Get values from second LUT:
		dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
		dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
		dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
		dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);

		// Get final values:
		str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
		str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
		str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
		str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);

		// Check for invalid input, any value larger than 63:
		const uint8x16_t classified
			= vcgtq_u8(str.val[0], vdupq_n_u8(63))
			| vcgtq_u8(str.val[1], vdupq_n_u8(63))
			| vcgtq_u8(str.val[2], vdupq_n_u8(63))
			| vcgtq_u8(str.val[3], vdupq_n_u8(63));
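		// (Each lane of `classified` is 0xFF if any of the four bytes of
		// the corresponding quad was invalid, and 0x00 otherwise.)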

		// Check that all bits are zero:
		if (vmaxvq_u8(classified) != 0U) {
			break;
		}

		// Compress four bytes into three:
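		// With val[0..3] holding 00aaaaaa, 00bbbbbb, 00cccccc, 00dddddd,
		// the three output bytes are aaaaaabb, bbbbcccc and ccdddddd: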
		dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
		dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
		dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];

		// Interleave and store decoded result:
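		// (vst3q_u8 is the counterpart of the vld4q_u8 above: it
		// re-interleaves the three vectors into 48 contiguous output
		// bytes.)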
		vst3q_u8((uint8_t *) *o, dec);

		*s += 64;
		*o += 48;

	} while (--rounds > 0);

	// Adjust for any rounds that were skipped:
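	// (`rounds` is only non-zero here if the loop exited early on invalid
	// input; the adjustment hands the unprocessed bytes back to the caller.)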
	*slen += rounds * 64;
	*olen -= rounds * 48;
}