1 /***
2 This file is part of udev, forked from systemd.
3
4 Copyright 2008-2011 Kay Sievers
5 Copyright 2012 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 /* Parts of this file are based on the GLIB utf8 validation functions. The
22 * original license text follows. */
23
24 /* gutf8.c - Operations on UTF-8 strings.
25 *
26 * Copyright (C) 1999 Tom Tromey
27 * Copyright (C) 2000 Red Hat, Inc.
28 *
29 * This library is free software; you can redistribute it and/or
30 * modify it under the terms of the GNU Library General Public
31 * License as published by the Free Software Foundation; either
32 * version 2 of the License, or (at your option) any later version.
33 *
34 * This library is distributed in the hope that it will be useful,
35 * but WITHOUT ANY WARRANTY; without even the implied warranty of
36 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
37 * Library General Public License for more details.
38 *
39 * You should have received a copy of the GNU Library General Public
40 * License along with this library; if not, write to the Free Software
41 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
42 */
43
44 #include <errno.h>
45 #include <stdlib.h>
46 #include <inttypes.h>
47 #include <string.h>
48 #include <stdbool.h>
49
50 #include "utf8.h"
51 #include "util.h"
52
/* Tells whether @ch is an acceptable code point: it must lie inside the
 * Unicode range and must not be one of the reserved values tested below. */
bool unichar_is_valid(uint32_t ch) {
        /* at or past 0x110000, i.e. outside the Unicode code space */
        const bool beyond_unicode = ch >= 0x110000;
        /* 0xD800..0xDFFF, set aside for UTF-16 surrogates */
        const bool utf16_surrogate = (ch & 0xFFFFF800) == 0xD800;
        /* reserved block 0xFDD0..0xFDEF */
        const bool reserved_block = ch >= 0xFDD0 && ch <= 0xFDEF;
        /* low 16 bits equal to 0xFFFE or 0xFFFF */
        const bool plane_tail = (ch & 0xFFFE) == 0xFFFE;

        return !(beyond_unicode || utf16_surrogate || reserved_block || plane_tail);
}
66
/* Reports whether @ch is a control character: anything in the C0 range
 * (below ' ') or DEL (0x7F) up to the end of the C1 range (0x9F). Tab and
 * newline sit in the C0 range but are deliberately not reported. */
static bool unichar_is_control(uint32_t ch) {
        /* DEL plus the C1 range */
        if (ch >= 0x7F && ch <= 0x9F)
                return true;

        /* anything else at or above ' ' is not a control character */
        if (ch >= ' ')
                return false;

        /* C0 range, with '\t' and '\n' exempted */
        return ch != '\t' && ch != '\n';
}
78
/* Number of bytes a UTF-8 sequence occupies, judged from its first byte only.
 * Returns 0 when the first byte matches none of the known lead patterns. */
static int utf8_encoded_expected_len(const char *str) {
        /* lead-byte patterns; entry k describes a sequence of k + 1 bytes */
        static const struct {
                unsigned char mask;
                unsigned char lead;
        } prefixes[] = {
                { 0x80, 0x00 }, /* 0xxxxxxx */
                { 0xe0, 0xc0 }, /* 110xxxxx */
                { 0xf0, 0xe0 }, /* 1110xxxx */
                { 0xf8, 0xf0 }, /* 11110xxx */
                { 0xfc, 0xf8 }, /* 111110xx */
                { 0xfe, 0xfc }, /* 1111110x */
        };
        unsigned char first;
        size_t k;

        assert(str);

        first = (unsigned char) str[0];

        for (k = 0; k < sizeof(prefixes) / sizeof(prefixes[0]); k++)
                if ((first & prefixes[k].mask) == prefixes[k].lead)
                        return (int) k + 1;

        return 0;
}
101
/* Decodes the UTF-8 sequence starting at @str into its numeric value.
 * Returns -EINVAL when the first byte cannot start a sequence or when one of
 * the following bytes is not a continuation byte (10xxxxxx). Decoding stops
 * at the first bad byte, so nothing past it is read. */
int utf8_encoded_to_unichar(const char *str) {
        /* payload bits of the first byte, indexed by sequence length */
        static const int lead_mask[] = { 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
        int n, value, pos;

        assert(str);

        n = utf8_encoded_expected_len(str);

        /* a single byte is its own value */
        if (n == 1)
                return (int) str[0];
        if (n < 2 || n > 6)
                return -EINVAL;

        value = (int) str[0] & lead_mask[n];

        /* every continuation byte contributes six more bits */
        for (pos = 1; pos < n; pos++) {
                int byte = (int) str[pos];

                if ((byte & 0xc0) != 0x80)
                        return -EINVAL;

                value = (value << 6) | (byte & 0x3f);
        }

        return value;
}
141
/*
 * Checks that the first @length bytes of @str are made up entirely of valid
 * UTF-8 sequences, none of which decodes to a control character as defined by
 * unichar_is_control(). A '\n' is accepted only when @newline is true.
 *
 * Returns true when every sequence passes (also for @length == 0), false as
 * soon as one sequence is invalid, does not fit into the remaining bytes, or
 * decodes to a rejected character.
 */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; length > 0;) {
                int expected_len, encoded_len, val;

                /* Judge the sequence length from the lead byte alone first:
                 * utf8_encoded_valid_unichar() inspects the continuation
                 * bytes, so a sequence cut off by @length has to be rejected
                 * before it is called, or bytes past the buffer get read. */
                expected_len = utf8_encoded_expected_len((const char*) p);
                if (expected_len <= 0 ||
                    (size_t) expected_len > length)
                        return false;

                encoded_len = utf8_encoded_valid_unichar((const char*) p);
                if (encoded_len < 0 ||
                    (size_t) encoded_len > length)
                        return false;

                val = utf8_encoded_to_unichar((const char*) p);
                if (val < 0 ||
                    unichar_is_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
167
/* Returns @str itself when every byte up to the terminating NUL is below 128,
 * and NULL as soon as a byte with the high bit set is found. */
char *ascii_is_valid(const char *str) {
        size_t idx = 0;

        assert(str);

        while (str[idx] != '\0') {
                /* high bit set means the byte is 128 or above */
                if (((unsigned char) str[idx] & 0x80) != 0)
                        return NULL;
                idx++;
        }

        return (char*) str;
}
179
/**
 * utf8_encode_unichar() - Write one UCS-4 character as UTF-8
 * @out_utf8: destination with room for at least 4 bytes, or NULL
 * @g: UCS-4 character to encode
 *
 * Stores the UTF-8 form of @g in @out_utf8 without appending a terminating
 * zero byte. When @out_utf8 is NULL nothing is written and only the length is
 * computed.
 *
 * Returns: The number of bytes the UTF-8 form occupies (1 to 4), or 0 when
 * @g is 1 << 21 or larger and therefore does not fit into four bytes.
 */
size_t utf8_encode_unichar(char *out_utf8, uint32_t g) {
        /* marker bits of the first byte, indexed by encoded length */
        static const uint8_t lead_marker[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
        size_t len, k;

        if (g < 0x80)
                len = 1;
        else if (g < 0x800)
                len = 2;
        else if (g < 0x10000)
                len = 3;
        else if (g < 0x200000)
                len = 4;
        else
                return 0;

        if (!out_utf8)
                return len;

        /* continuation bytes are filled back to front, six payload bits each */
        for (k = len - 1; k > 0; k--) {
                out_utf8[k] = 0x80 | (g & 0x3f);
                g >>= 6;
        }

        /* whatever is left of g fits into the payload bits of the first byte */
        out_utf8[0] = lead_marker[len] | g;

        return len;
}
223
utf16_to_utf8(const void * s,size_t length)224 char *utf16_to_utf8(const void *s, size_t length) {
225 const uint8_t *f;
226 char *r, *t;
227
228 r = new(char, (length * 4 + 1) / 2 + 1);
229 if (!r)
230 return NULL;
231
232 f = s;
233 t = r;
234
235 while (f < (const uint8_t*) s + length) {
236 uint16_t w1, w2;
237
238 /* see RFC 2781 section 2.2 */
239
240 w1 = f[1] << 8 | f[0];
241 f += 2;
242
243 if (!utf16_is_surrogate(w1)) {
244 t += utf8_encode_unichar(t, w1);
245
246 continue;
247 }
248
249 if (utf16_is_trailing_surrogate(w1))
250 continue;
251 else if (f >= (const uint8_t*) s + length)
252 break;
253
254 w2 = f[1] << 8 | f[0];
255 f += 2;
256
257 if (!utf16_is_trailing_surrogate(w2)) {
258 f -= 2;
259 continue;
260 }
261
262 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
263 }
264
265 *t = 0;
266 return r;
267 }
268
/* Number of bytes needed to encode @unichar, from 1 up to 6. Values below
 * 0x80, negative ones included, yield 1. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* exclusive upper bounds of the 1..5 byte encodings */
        static const int limit[] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
        int n;

        for (n = 0; n < 5; n++)
                if (unichar < limit[n])
                        return n + 1;

        /* everything larger takes the longest form */
        return 6;
}
285
/* Validates the single UTF-8 sequence starting at @str. Returns its length in
 * bytes on success, or -EINVAL when the lead byte is bad, a byte of the
 * sequence lacks the high bit, the value is not encoded in its shortest form,
 * or the decoded value is rejected by unichar_is_valid(). */
int utf8_encoded_valid_unichar(const char *str) {
        int expected, decoded, pos;

        assert(str);

        expected = utf8_encoded_expected_len(str);
        switch (expected) {
        case 0:
                /* not a possible first byte */
                return -EINVAL;
        case 1:
                /* plain ASCII needs no further checks */
                return 1;
        default:
                break;
        }

        /* all bytes of a multi-byte sequence carry the high bit; this also
         * stops at a terminating zero byte before anything past it is read */
        for (pos = 0; pos < expected; pos++)
                if (!(str[pos] & 0x80))
                        return -EINVAL;

        decoded = utf8_encoded_to_unichar(str);

        /* The length comparison rejects over-long encodings. It also catches
         * a negative decode error, since that maps to length 1, which never
         * equals the multi-byte length handled here. */
        if (utf8_unichar_to_encoded_len(decoded) != expected ||
            !unichar_is_valid(decoded))
                return -EINVAL;

        return expected;
}
317